From b977e52620d7cf9d597fd690e3ab176b6cae79c5 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 13:09:17 -0600 Subject: [PATCH 001/100] properly escape quotes so commands with json will work. --- aiopslab/service/shell.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/aiopslab/service/shell.py b/aiopslab/service/shell.py index 2b894caa..0ea698a7 100644 --- a/aiopslab/service/shell.py +++ b/aiopslab/service/shell.py @@ -22,7 +22,6 @@ def exec(command: str, input_data=None, cwd=None): k8s_host = config.get("k8s_host", "localhost") # Default to localhost if k8s_host == "kind": - print("[INFO] Running command inside kind-control-plane Docker container.") return Shell.docker_exec("kind-control-plane", command) elif k8s_host == "localhost": @@ -95,7 +94,9 @@ def ssh_exec(host: str, user: str, ssh_key_path: str, command: str): @staticmethod def docker_exec(container_name: str, command: str): """Execute a command inside a running Docker container.""" - docker_command = f"docker exec {container_name} sh -c '{command}'" + escaped_command = command.replace('"', '\\"') + + docker_command = f'docker exec {container_name} sh -c "{escaped_command}"' try: out = subprocess.run( From a39de4dfb660d3791134764732b7655c6229fd87 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 13:31:16 -0600 Subject: [PATCH 002/100] This error message isn't for actual errors, so I'm removing it --- aiopslab/service/shell.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aiopslab/service/shell.py b/aiopslab/service/shell.py index 0ea698a7..815589a3 100644 --- a/aiopslab/service/shell.py +++ b/aiopslab/service/shell.py @@ -78,7 +78,6 @@ def ssh_exec(host: str, user: str, ssh_key_path: str, command: str): if exit_status != 0: error_message = stderr.read().decode("utf-8") - print(f"[ERROR] SSH Command execution failed: {error_message}") return error_message else: output_message = stdout.read().decode("utf-8") From fb585e69c5506dbc3296e4117fb4f1ebf18e53bb Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 13:41:21 -0600 Subject: [PATCH 003/100] Added code for onboarding task evaluation --- aiopslab/evaluator.py | 215 +++++++++++++++++++++++++++ aiopslab/orchestrator/eval_parser.py | 170 +++++++++++++++++++++ assessment.py | 132 ++++++++++++++++ 3 files changed, 517 insertions(+) create mode 100644 aiopslab/evaluator.py create mode 100644 aiopslab/orchestrator/eval_parser.py create mode 100644 assessment.py diff --git a/aiopslab/evaluator.py b/aiopslab/evaluator.py new file mode 100644 index 00000000..da0dd6fa --- /dev/null +++ b/aiopslab/evaluator.py @@ -0,0 +1,215 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Orchestrator class that interfaces with the agent and the environment.""" + +from aiopslab.service.helm import Helm +from aiopslab.service.kubectl import KubeCtl +from aiopslab.session import Session +from aiopslab.orchestrator.problems.registry import ProblemRegistry +from aiopslab.orchestrator.eval_parser import EvalParser +from aiopslab.utils.status import * +from aiopslab.service.telemetry.prometheus import Prometheus +import time +import inspect +import asyncio + + +class Evaluator: + def __init__(self): + self.agent = None + self.session = None + self.parser = EvalParser() + self.probs = ProblemRegistry() + self.sprint = SessionPrint() + self.execution_start_time = None + self.execution_end_time = None + self.kubectl = KubeCtl() + + def init_problem(self, problem_id: str): + """Initialize a problem instance for the agent to solve. + + Args: + problem_id (str): The problem instance identifier. + + Returns: + tuple: A tuple containing the problem description, task message, and session object. + """ + # Start timer + self.execution_start_time = time.time() + + self.session = Session() + print(f"Session ID: {self.session.session_id}") + prob = self.probs.get_problem_instance(problem_id) + self.session.set_problem(prob, pid=problem_id) + self.session.set_agent(self.agent_name) + + print("Setting up OpenEBS...") + + command = "kubectl get pods -n openebs" + result = self.kubectl.exec_command(command) + if "Running" in result: + print("OpenEBS is already running. Skipping installation.") + else: + self.kubectl.exec_command( + "kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml" + ) + self.kubectl.exec_command( + "kubectl patch storageclass openebs-hostpath -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'" + ) + self.kubectl.wait_for_ready("openebs") + print("OpenEBS setup completed.") + + # Setup and deploy Prometheus + self.prometheus = Prometheus() + self.prometheus.deploy() + + # deploy service + prob.app.delete() + prob.app.deploy() + + # inject fault + prob.inject_fault() + + # Check if start_workload is async or sync + if inspect.iscoroutinefunction(prob.start_workload): + asyncio.create_task(prob.start_workload()) + else: + prob.start_workload() + + task_desc = prob.get_task_description() + instructions = prob.get_instructions() + actions = prob.get_available_actions() + + return task_desc, instructions, actions + + def register_agent(self, agent, name="agent"): + """Register the agent for the current session. + + Args: + agent: The agent to register. + name: The name of the agent (default: "agent"). 
+ """ + self.agent = agent + self.agent_name = name + + async def ask_agent(self, input): + """Ask the agent for the next action given the current context.""" + assert self.session is not None + assert self.agent is not None + + agent_response = await self.agent.get_action(input) + self.session.add({"role": "assistant", "content": agent_response}) + + return agent_response + + async def ask_env(self, input): + """Ask the environment for the observation given the current action.""" + assert self.session is not None + + try: + resp = self.parser.parse(input) + except ResponseParsingError as e: + self.session.add({"role": "env", "content": str(e)}) + return str(e) + + api, args, kwargs = resp["api_name"], resp["args"], resp["kwargs"] + + # special handling for submit + if api == "submit": + self.session.set_solution(args[0] if len(args) == 1 else args) + + # Use the problem's eval method to check if solution is valid + try: + # Calculate the current duration manually since session isn't ended yet + current_time = time.time() + current_duration = current_time - self.session.start_time + + # Create a temporary dict to store results + temp_results = self.session.problem.eval( + self.session.solution, + self.session.history, + current_duration + ) + + # Check if the solution is successful based on eval results + if temp_results.get("success", False): + env_response = SubmissionStatus.VALID_SUBMISSION + else: + env_response = SubmissionStatus.INVALID_SUBMISSION + + except Exception as e: + print(f"Error validating submission: {e}") + import traceback + traceback.print_exc() + env_response = SubmissionStatus.INVALID_SUBMISSION + else: + # Regular action handling + try: + env_response = self.session.problem.perform_action(api, *args, **kwargs) + except InvalidActionError as e: + env_response = str(e) + + self.session.add({"role": "env", "content": env_response}) + return env_response + + async def start_problem(self): + """Start the task and run until a valid submission is received. + + Returns: + dict: The final state of the session. + """ + assert self.session is not None + action_instr = "Please take the next action" + action, env_response, results = "", "", {} + self.session.start() + self.execution_start_time = time.time() + + # Initial environment response + env_response = await self.ask_env(action) + + while env_response != SubmissionStatus.VALID_SUBMISSION: + action = await self.ask_agent(action_instr) + self.sprint.agent(action) + + env_response = await self.ask_env(action) + self.sprint.service(env_response) + + if env_response == SubmissionStatus.VALID_SUBMISSION: + break + elif env_response == SubmissionStatus.INVALID_SUBMISSION: + action_instr = "Your submission was invalid. Please continue working on the problem." 
+ else: + action_instr = env_response + + self.session.end() + + # Final evaluation with the valid submission + if env_response == SubmissionStatus.VALID_SUBMISSION: + results = self.session.problem.eval( + self.session.solution, self.session.history, self.session.get_duration() + ) + self.sprint.result(results) + + self.session.set_results(results) + self.session.to_json() + self.session.problem.recover_fault() + + # App cleanup + self.session.problem.app.cleanup() + + self.execution_end_time = time.time() + total_execution_time = self.execution_end_time - self.execution_start_time + time_keys = ["TTD", "TTL", "TTA", "TTM"] + key = next((k for k in time_keys if k in results), None) + framework_overhead = ( + total_execution_time - (results.get(key, 0) or 0) + ) + print(f"Framework overhead: {framework_overhead}") + + return { + "history": self.session.history, + "final_state": env_response, + "results": results, + "framework_overhead": framework_overhead, + } \ No newline at end of file diff --git a/aiopslab/orchestrator/eval_parser.py b/aiopslab/orchestrator/eval_parser.py new file mode 100644 index 00000000..3211aaf3 --- /dev/null +++ b/aiopslab/orchestrator/eval_parser.py @@ -0,0 +1,170 @@ +"""Custom parser for the onboarding task evaluator""" + +import re +import ast + +from aiopslab.utils.status import ResponseParsingError + +class EvalParser: + def __init__(self): + # Define list of known API commands that need special handling + self.known_apis = ["submit"] + + def parse(self, response: str) -> dict: + """Parses the response string to extract the API name and arguments. + + Args: + response (str): The response string (typically an agent's response). + + Returns: + dict: The parsed API name and arguments. + """ + code_block = self.extract_codeblock(response) + context = self.extract_context(response) + + # If there's no code block, check if the response itself is a command + if not code_block: + code_block = response.strip() + + # Check if the code block is a simple "submit" command without parameters + if code_block.strip() == "submit": + return { + "api_name": "submit", + "args": [None], # Placeholder argument + "kwargs": {}, + "context": context, + } + + # Handle other known APIs with function call syntax + if any(code_block.strip().startswith(api + "(") for api in self.known_apis): + api_name = self.parse_api_name(code_block) + args, kwargs = self.parse_args(code_block) + return { + "api_name": api_name, + "args": args, + "kwargs": kwargs, + "context": context, + } + + # Default to exec_shell for unrecognized commands + # Strip any leading/trailing backticks if present + command = code_block.strip("` \n") + return { + "api_name": "exec_shell", + "args": [command], + "kwargs": {}, + "context": context, + } + + def extract_codeblock(self, response: str) -> str: + """Extract a markdown code block from a string. + + Args: + response (str): The response string. + + Returns: + str: The extracted code block. + """ + outputlines = response.split("\n") + indexlines = [i for i, line in enumerate(outputlines) if "```" in line] + if len(indexlines) < 2: + return "" + return "\n".join(outputlines[indexlines[0] + 1 : indexlines[1]]) + + def extract_context(self, response: str) -> list: + """Extract context outside of a code block. + + Args: + response (str): The response string. + + Returns: + list: The extracted context. 
+ """ + pattern = r"(?:```[\s\S]*?```)|(.*?)(?:(?=```)|$)" + matches = re.findall(pattern, response, re.DOTALL) + context = [match.strip() for match in matches if match.strip()] + + return context + + def parse_api_name(self, response: str) -> str: + """Parses the API name from the response function call. + + Args: + response (str): The response string. + + Returns: + str: The API name. + """ + first_parenthesis = response.find("(") + if first_parenthesis != -1: + return response[:first_parenthesis].strip() + return "" + + def parse_args(self, response: str) -> tuple: + """Parses the arguments of a function call. + + Args: + response (str): The response string. + + Returns: + tuple: (args, kwargs) - Lists of positional and keyword arguments. + """ + first_parenthesis = response.find("(") + last_parenthesis = response.rfind(")") + + if first_parenthesis != -1 and last_parenthesis != -1: + args_str = response[first_parenthesis + 1 : last_parenthesis].strip() + + # case: no arguments + if not args_str: + return [], {} + + # case: positional/kwargs handled w/ ast.parse + try: + parsed = ast.parse(f"func({args_str})") + call = parsed.body[0].value + args, kwargs = [], {} + + for arg in call.args: + if isinstance(arg, ast.Constant): + args.append(arg.value) + elif isinstance(arg, (ast.List, ast.Tuple)): + args.append([self.eval_ast_node(elt) for elt in arg.elts]) + elif isinstance(arg, ast.Dict): + args.append( + { + self.eval_ast_node(key): self.eval_ast_node(value) + for key, value in zip(arg.keys, arg.values) + } + ) + else: + args.append(self.eval_ast_node(arg)) + + for kwarg in call.keywords: + kwargs[kwarg.arg] = self.eval_ast_node(kwarg.value) + + return args, kwargs + except Exception as e: + raise ResponseParsingError(f"Error parsing arguments: {str(e)}") + + return [], {} + + def eval_ast_node(self, node): + """Evaluates an AST node to its Python value.""" + if isinstance(node, ast.Constant): + return node.value + elif isinstance(node, ast.List): + return [self.eval_ast_node(elt) for elt in node.elts] + elif isinstance(node, ast.Dict): + return { + self.eval_ast_node(key): self.eval_ast_node(value) + for key, value in zip(node.keys, node.values) + } + elif isinstance(node, ast.Name): + if node.id == "True": + return True + elif node.id == "False": + return False + elif node.id == "None": + return None + raise ValueError(f"Unsupported AST node type: {type(node)}") \ No newline at end of file diff --git a/assessment.py b/assessment.py new file mode 100644 index 00000000..939ae7ff --- /dev/null +++ b/assessment.py @@ -0,0 +1,132 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + + +"""AIOpsLab CLI client.""" + +import asyncio +import json +from prompt_toolkit import PromptSession +from prompt_toolkit.styles import Style +from prompt_toolkit.patch_stdout import patch_stdout +from rich.console import Console +from rich.markdown import Markdown +from rich.panel import Panel +from prompt_toolkit.completion import WordCompleter + +from aiopslab.evaluator import Evaluator + + +WELCOME = """ +# AIOpsLab +- Type your commands or actions below. +- Use `exit` to quit the application. +- Use `start ` to begin a new problem. 
+""" + +TASK_MESSAGE = """{prob_desc} +You are provided with the following APIs to interact with the service: + +{telemetry_apis} + +You are also provided an API to a secure terminal to the service where you can run commands: + +{shell_api} + +Finally, you will submit your solution for this task using the following API: + +{submit_api} + +At each turn think step-by-step and respond with your action. +""" + + +class HumanAgent: + def __init__(self, orchestrator): + self.session = PromptSession() + self.console = Console(force_terminal=True, color_system="auto") + self.orchestrator = orchestrator + self.pids = self.orchestrator.probs.get_problem_ids() + self.completer = WordCompleter(self.pids, ignore_case=True, match_middle=True) + + def display_welcome_message(self): + self.console.print(Markdown(WELCOME), justify="center") + self.console.print() + + def display_context(self, problem_desc, apis): + self.shell_api = self._filter_dict(apis, lambda k, _: "exec_shell" in k) + self.submit_api = self._filter_dict(apis, lambda k, _: "submit" in k) + self.telemetry_apis = self._filter_dict( + apis, lambda k, _: "exec_shell" not in k and "submit" not in k + ) + + stringify_apis = lambda apis: "\n\n".join( + [f"{k}\n{v}" for k, v in apis.items()] + ) + + self.task_message = TASK_MESSAGE.format( + prob_desc=problem_desc, + telemetry_apis=stringify_apis(self.telemetry_apis), + shell_api=stringify_apis(self.shell_api), + submit_api=stringify_apis(self.submit_api), + ) + + self.console.print(Markdown(self.task_message)) + + def display_env_message(self, env_input): + self.console.print(Panel(env_input, title="Environment", style="white on blue")) + self.console.print() + + async def set_problem(self): + self.init_problem("redeploy_without_PV-mitigation-1") + + async def get_action(self, env_input): + user_input = await self.get_user_input() + template = "Action:```\n{}\n```" + return template.format(user_input) + + def init_problem(self, problem_id="misconfig-mitigation-1"): + problem_desc, _, apis = self.orchestrator.init_problem(problem_id) + self.display_context(problem_desc, apis) + + async def get_user_input(self, completer=None): + loop = asyncio.get_running_loop() + style = Style.from_dict({"prompt": "ansigreen bold"}) + prompt_text = [("class:prompt", "shell> ")] + + with patch_stdout(): + try: + input = await loop.run_in_executor( + None, + lambda: self.session.prompt( + prompt_text, style=style, completer=completer + ), + ) + + if input.lower() == "exit": + raise SystemExit + + return input + except (SystemExit, KeyboardInterrupt, EOFError): + raise SystemExit from None + + def _filter_dict(self, dictionary, filter_func): + return {k: v for k, v in dictionary.items() if filter_func(k, v)} + + +async def main(): + orchestrator = Evaluator() + agent = HumanAgent(orchestrator) + orchestrator.register_agent(agent, name="human") + + agent.display_welcome_message() + await agent.set_problem() + + results = await orchestrator.start_problem() + + with open("results.json", "w") as f: + json.dump(results, f, indent=2) + + +if __name__ == "__main__": + asyncio.run(main()) From 1952aa5fd26958c19edb43748611e0b0ec61cd22 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 13:49:12 -0600 Subject: [PATCH 004/100] Add console logs since we no longer use env --- aiopslab/evaluator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aiopslab/evaluator.py b/aiopslab/evaluator.py index da0dd6fa..fae63fcc 100644 --- a/aiopslab/evaluator.py +++ b/aiopslab/evaluator.py @@ -176,9 
+176,10 @@ async def start_problem(self): self.sprint.service(env_response) if env_response == SubmissionStatus.VALID_SUBMISSION: + print("Submission is correct!") break elif env_response == SubmissionStatus.INVALID_SUBMISSION: - action_instr = "Your submission was invalid. Please continue working on the problem." + print("Your submission was invalid. Please continue working on the problem.") else: action_instr = env_response From 2d3d79a77e2882fa9eb9468450c842bea3c89d6c Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 14:00:00 -0600 Subject: [PATCH 005/100] parse results before creating json --- assessment.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/assessment.py b/assessment.py index 939ae7ff..733e87d5 100644 --- a/assessment.py +++ b/assessment.py @@ -124,8 +124,14 @@ async def main(): results = await orchestrator.start_problem() + session_data = orchestrator.session.to_dict() + with open("results.json", "w") as f: - json.dump(results, f, indent=2) + json.dump(session_data, f, indent=2) + + print(f"Results saved to results.json") + + return results if __name__ == "__main__": From 8bcdfaaf23f0be76b84718e647072db204c50d87 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 14:08:14 -0600 Subject: [PATCH 006/100] Get name for results file --- assessment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/assessment.py b/assessment.py index 733e87d5..e72163b8 100644 --- a/assessment.py +++ b/assessment.py @@ -119,6 +119,8 @@ async def main(): agent = HumanAgent(orchestrator) orchestrator.register_agent(agent, name="human") + first_name = input("What is your first name?: ") + agent.display_welcome_message() await agent.set_problem() @@ -126,10 +128,10 @@ async def main(): session_data = orchestrator.session.to_dict() - with open("results.json", "w") as f: + with open(f"{first_name}_results.json", "w") as f: json.dump(session_data, f, indent=2) - print(f"Results saved to results.json") + print(f"Results saved to {first_name}_results.json") return results From e536280d8ce43c7b8f888e77fd4b43d1ae119d17 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 14:32:56 -0600 Subject: [PATCH 007/100] typo fix --- aiopslab/service/apps/hotelres.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/service/apps/hotelres.py b/aiopslab/service/apps/hotelres.py index f5e8e355..6ac3e21d 100644 --- a/aiopslab/service/apps/hotelres.py +++ b/aiopslab/service/apps/hotelres.py @@ -75,7 +75,7 @@ def deploy_without_wait(self): print(f"Deploying Kubernetes configurations in namespace: {self.namespace}") self.kubectl.apply_configs(self.namespace, self.k8s_deploy_path) - print(f"Waiting for being stable...") + print(f"Waiting for stability...") time.sleep(30) def delete(self): From ce23d78f257edd5b09a92eb1337e0d8342277fda Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 14:33:43 -0600 Subject: [PATCH 008/100] Add task message for assessment --- assessment.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/assessment.py b/assessment.py index e72163b8..194fe681 100644 --- a/assessment.py +++ b/assessment.py @@ -18,26 +18,23 @@ WELCOME = """ -# AIOpsLab -- Type your commands or actions below. -- Use `exit` to quit the application. -- Use `start ` to begin a new problem. 
+# AIOpsLab Onboarding Assessment """ -TASK_MESSAGE = """{prob_desc} -You are provided with the following APIs to interact with the service: +TASK_MESSAGE = """\n\n\n\n +There's a problem in the kubernetes cluster in the test-hotel-reservation namespace. -{telemetry_apis} +You need to fix the issue and get all the pods into a ready state. -You are also provided an API to a secure terminal to the service where you can run commands: +You have access to a shell, take whatever action you deem necessary to resolve the issue. -{shell_api} +Once you believe the incident is resolved, run the `submit` command. If your solution is incorrect, it will tell you. -Finally, you will submit your solution for this task using the following API: +You can use any resources you want to complete the assessment except for another person. However, please run all shell commands inside of the interface. -{submit_api} +Your results will be saved in a file called yourFistName_results.json, please email it to jclark58@illinois.edu -At each turn think step-by-step and respond with your action. +If you encounter a bug, send it to jclark58@illinois.edu """ From ccb2620760479e46787f53370661438c8c142163 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 6 Mar 2025 16:42:20 -0600 Subject: [PATCH 009/100] Add hint to task message --- assessment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/assessment.py b/assessment.py index 194fe681..a513da71 100644 --- a/assessment.py +++ b/assessment.py @@ -24,6 +24,8 @@ TASK_MESSAGE = """\n\n\n\n There's a problem in the kubernetes cluster in the test-hotel-reservation namespace. +The issue is that there are unmet PersistentVolumeClaims (PVCs) because of unbound persistent volumes. + You need to fix the issue and get all the pods into a ready state. You have access to a shell, take whatever action you deem necessary to resolve the issue. 
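The onboarding patches above (003–009) route everything the participant types at the `shell>` prompt through `EvalParser`: `HumanAgent` wraps the input in an Action code block, a bare `submit` is treated as a submission, recognized call syntax is parsed with `ast`, and anything else falls back to `exec_shell`. A minimal usage sketch of that dispatch, assuming the `aiopslab` package is importable and using the module path from PATCH 003 (renamed to `onboarding_eval_parser` in PATCH 010 below):

```python
# Sketch only: exercises the EvalParser added in PATCH 003. After PATCH 010
# the import path becomes aiopslab.orchestrator.onboarding_eval_parser.
from aiopslab.orchestrator.eval_parser import EvalParser

parser = EvalParser()
wrap = "Action:```\n{}\n```".format  # same template HumanAgent.get_action uses

# A bare `submit` (no parentheses) is accepted as a submission with a
# placeholder argument.
print(parser.parse(wrap("submit"))["api_name"])        # -> submit

# Unrecognized commands fall through to exec_shell, so raw kubectl/shell
# input works without any wrapper syntax.
resp = parser.parse(wrap("kubectl get pods -n test-hotel-reservation"))
print(resp["api_name"], resp["args"])                  # -> exec_shell ['kubectl get pods -n test-hotel-reservation']

# Known APIs written as calls are parsed into args/kwargs via ast.parse.
resp = parser.parse(wrap("submit()"))
print(resp["api_name"], resp["args"], resp["kwargs"])  # -> submit [] {}
```

The `exec_shell` fallback is what lets the assessment above be completed with ordinary kubectl commands typed directly at the prompt.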
From 8a53f07976389bec17f5f54f8abb3c5ba2558bd9 Mon Sep 17 00:00:00 2001 From: Yinfang Chen Date: Wed, 19 Mar 2025 18:17:53 -0400 Subject: [PATCH 010/100] rename onboarding related files --- aiopslab/{evaluator.py => onboarding_evaluator.py} | 2 +- .../orchestrator/{eval_parser.py => onboarding_eval_parser.py} | 0 assessment.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename aiopslab/{evaluator.py => onboarding_evaluator.py} (99%) rename aiopslab/orchestrator/{eval_parser.py => onboarding_eval_parser.py} (100%) diff --git a/aiopslab/evaluator.py b/aiopslab/onboarding_evaluator.py similarity index 99% rename from aiopslab/evaluator.py rename to aiopslab/onboarding_evaluator.py index fae63fcc..6eb051c6 100644 --- a/aiopslab/evaluator.py +++ b/aiopslab/onboarding_evaluator.py @@ -7,7 +7,7 @@ from aiopslab.service.kubectl import KubeCtl from aiopslab.session import Session from aiopslab.orchestrator.problems.registry import ProblemRegistry -from aiopslab.orchestrator.eval_parser import EvalParser +from aiopslab.orchestrator.onboarding_eval_parser import EvalParser from aiopslab.utils.status import * from aiopslab.service.telemetry.prometheus import Prometheus import time diff --git a/aiopslab/orchestrator/eval_parser.py b/aiopslab/orchestrator/onboarding_eval_parser.py similarity index 100% rename from aiopslab/orchestrator/eval_parser.py rename to aiopslab/orchestrator/onboarding_eval_parser.py diff --git a/assessment.py b/assessment.py index a513da71..a39635ac 100644 --- a/assessment.py +++ b/assessment.py @@ -14,7 +14,7 @@ from rich.panel import Panel from prompt_toolkit.completion import WordCompleter -from aiopslab.evaluator import Evaluator +from aiopslab.onboarding_evaluator import Evaluator WELCOME = """ From 7cb7ff389c80ba14329d58e6f7c10921bdbcee56 Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Wed, 19 Mar 2025 18:18:17 -0500 Subject: [PATCH 011/100] Handle errors from get_traces&get_metrics --- aiopslab/orchestrator/orchestrator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/aiopslab/orchestrator/orchestrator.py b/aiopslab/orchestrator/orchestrator.py index a1938f1d..4d7d53d0 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -121,8 +121,15 @@ async def ask_env(self, input): try: env_response = self.session.problem.perform_action(api, *args, **kwargs) + + if hasattr(env_response, "error"): + env_response = str(env_response) + print("An error occurred:", env_response) except InvalidActionError as e: env_response = str(e) + except Exception as e: + env_response = str(e) + print("Unhandled exception:", e) self.session.add({"role": "env", "content": env_response}) From 9c0ef3c938d151c21012bdbb32dff4fbe4b731d1 Mon Sep 17 00:00:00 2001 From: Yinfang Chen Date: Thu, 20 Mar 2025 01:11:11 -0400 Subject: [PATCH 012/100] fix the issue of getting log of the workload pod --- aiopslab/orchestrator/actions/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiopslab/orchestrator/actions/base.py b/aiopslab/orchestrator/actions/base.py index b623cca5..e232b5b3 100644 --- a/aiopslab/orchestrator/actions/base.py +++ b/aiopslab/orchestrator/actions/base.py @@ -39,6 +39,8 @@ def get_logs(namespace: str, service: str) -> str: user_service_pod = kubectl.get_pod_name( namespace, f"io.kompose.service={service}" ) + elif namespace == "default" and "wrk2-job" in service: + user_service_pod = kubectl.get_pod_name(namespace, f"job-name=wrk2-job") else: raise Exception logs = 
kubectl.get_pod_logs(user_service_pod, namespace) From c7c44a362c62827c80e28fc27da8275140198c92 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 20 Mar 2025 15:46:32 -0500 Subject: [PATCH 013/100] Add helm as a requirement to README Closes #32 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c78543ec..9ba4df03 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Moreover, AIOpsLab provides a built-in benchmark suite with a set of problems to ### Requirements - Python >= 3.11 +- [Helm](https://helm.sh/docs/helm/helm_install/) Recommended installation: ```bash From 32bbda8ae5b1f7480b04666400f78006f1716e46 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 20 Mar 2025 23:12:38 -0500 Subject: [PATCH 014/100] Fixed link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ba4df03..2d43007c 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Moreover, AIOpsLab provides a built-in benchmark suite with a set of problems to ### Requirements - Python >= 3.11 -- [Helm](https://helm.sh/docs/helm/helm_install/) +- [Helm](https://helm.sh/docs/helm/helm_install/](https://helm.sh/) Recommended installation: ```bash From 12db7bca559fc62d78e20b4f13b16c68d898e0c7 Mon Sep 17 00:00:00 2001 From: Yinfang Chen Date: Thu, 20 Mar 2025 23:26:29 -0500 Subject: [PATCH 015/100] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d43007c..167a5273 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Moreover, AIOpsLab provides a built-in benchmark suite with a set of problems to ### Requirements - Python >= 3.11 -- [Helm](https://helm.sh/docs/helm/helm_install/](https://helm.sh/) +- [Helm](https://helm.sh/) Recommended installation: ```bash From cab9e0a631d93a052f0671c41a6dacbeba5c3aa3 Mon Sep 17 00:00:00 2001 From: Yinfang Chen Date: Thu, 27 Mar 2025 02:09:38 -0500 Subject: [PATCH 016/100] fix return value of read operations --- aiopslab/orchestrator/actions/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/actions/base.py b/aiopslab/orchestrator/actions/base.py index e232b5b3..79f3d70a 100644 --- a/aiopslab/orchestrator/actions/base.py +++ b/aiopslab/orchestrator/actions/base.py @@ -113,7 +113,7 @@ def read_metrics(file_path: str) -> str: str: The requested metrics or an error message. """ if not os.path.exists(file_path): - return {"error": f"Metrics file '{file_path}' not found."} + return f"error: Metrics file '{file_path}' not found." try: df_metrics = pd.read_csv(file_path) @@ -163,7 +163,7 @@ def read_traces(file_path: str) -> str: str: The requested traces or an error message. """ if not os.path.exists(file_path): - return {"error": f"Traces file '{file_path}' not found."} + return f"error: Traces file '{file_path}' not found." 
try: df_traces = pd.read_csv(file_path) From e984ed013ceaf42c8d2dfe2d519f3e9266f51a86 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:24:59 -0500 Subject: [PATCH 017/100] Update prometheus interface to use PVC everywhere --- aiopslab/service/telemetry/prometheus.py | 38 ++++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/aiopslab/service/telemetry/prometheus.py b/aiopslab/service/telemetry/prometheus.py index 6bced8e9..b4f4c22b 100644 --- a/aiopslab/service/telemetry/prometheus.py +++ b/aiopslab/service/telemetry/prometheus.py @@ -70,13 +70,13 @@ def deploy(self): print("Prometheus is already running. Skipping redeployment.") return - self._delete_pv() + self._delete_pvc() Helm.uninstall(**self.helm_configs) if self.pvc_config_file: - pv_name = self._get_pv_name_from_file(self.pvc_config_file) - if not self._pv_exists(pv_name): - self._apply_pv() + pvc_name = self._get_pvc_name_from_file(self.pvc_config_file) + if not self._pv_exists(pvc_name): + self._apply_pvc() Helm.install(**self.helm_configs) Helm.assert_if_deployed(self.namespace) @@ -86,36 +86,36 @@ def teardown(self): Helm.uninstall(**self.helm_configs) if self.pvc_config_file: - self._delete_pv() + self._delete_pvc() - def _apply_pv(self): - """Apply the PersistentVolume configuration.""" - print(f"Applying PersistentVolume from {self.pvc_config_file}") + def _apply_pvc(self): + """Apply the PersistentVolumeClaim configuration.""" + print(f"Applying PersistentVolumeClaim from {self.pvc_config_file}") KubeCtl().exec_command( f"kubectl apply -f {self.pvc_config_file} -n {self.namespace}" ) - def _delete_pv(self): + def _delete_pvc(self): """Delete the PersistentVolume and associated PersistentVolumeClaim.""" - pv_name = self._get_pv_name_from_file(self.pvc_config_file) - result = KubeCtl().exec_command(f"kubectl get pv {pv_name} --ignore-not-found") + pvc_name = self._get_pv_name_from_file(self.pvc_config_file) + result = KubeCtl().exec_command(f"kubectl get pvc {pvc_name} --ignore-not-found") if result: - print(f"Deleting PersistentVolume {pv_name}") - KubeCtl().exec_command(f"kubectl delete pv {pv_name}") - print(f"Successfully deleted PersistentVolume from {pv_name}") + print(f"Deleting PersistentVolumeClaim {pvc_name}") + KubeCtl().exec_command(f"kubectl delete pv {pvc_name}") + print(f"Successfully deleted PersistentVolumeClaim from {pvc_name}") else: - print(f"PersistentVolume {pv_name} not found. Skipping deletion.") + print(f"PersistentVolumeClaim {pvc_name} not found. 
Skipping deletion.") def _get_pv_name_from_file(self, pv_config_file): - """Extract PV name from the configuration file.""" + """Extract PVC name from the configuration file.""" with open(pv_config_file, "r") as file: pv_config = yaml.safe_load(file) return pv_config["metadata"]["name"] - def _pv_exists(self, pv_name: str) -> bool: - """Check if the PersistentVolume exists.""" - command = f"kubectl get pv {pv_name}" + def _pvc_exists(self, pv_name: str) -> bool: + """Check if the PersistentVolumeClaim exists.""" + command = f"kubectl get pvc {pv_name}" try: result = KubeCtl().exec_command(command) if "No resources found" in result or "Error" in result: From a858f6b3f659831c38683477d3aa1a07e47d5f1d Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:25:51 -0500 Subject: [PATCH 018/100] teardown openebs and prometheus during cleanup --- aiopslab/orchestrator/orchestrator.py | 29 +++++++++++++++------------ 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/aiopslab/orchestrator/orchestrator.py b/aiopslab/orchestrator/orchestrator.py index 4d7d53d0..fcc96bed 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -46,19 +46,15 @@ def init_problem(self, problem_id: str): print("Setting up OpenEBS...") - command = "kubectl get pods -n openebs" - result = self.kubectl.exec_command(command) - if "Running" in result: - print("OpenEBS is already running. Skipping installation.") - else: - self.kubectl.exec_command( - "kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml" - ) - self.kubectl.exec_command( - "kubectl patch storageclass openebs-hostpath -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'" - ) - self.kubectl.wait_for_ready("openebs") - print("OpenEBS setup completed.") + # Install OpenEBS + self.kubectl.exec_command( + "kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml" + ) + self.kubectl.exec_command( + "kubectl patch storageclass openebs-hostpath -p '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"true\"}}}'" + ) + self.kubectl.wait_for_ready("openebs") + print("OpenEBS setup completed.") # Setup and deploy Prometheus self.prometheus = Prometheus() @@ -181,6 +177,13 @@ async def start_problem(self, max_steps: int): # But this will take more time. 
# if not self.session.problem.sys_status_after_recovery(): self.session.problem.app.cleanup() + self.prometheus.teardown() + print("Uninstalling OpenEBS...") + self.kubectl.exec_command("kubectl delete sc openebs-hostpath openebs-device --ignore-not-found") + self.kubectl.exec_command( + "kubectl delete -f https://openebs.github.io/charts/openebs-operator.yaml" + ) + self.kubectl.wait_for_namespace_deletion("openebs") self.execution_end_time = time.time() total_execution_time = self.execution_end_time - self.execution_start_time From c3a2ccf6a8c38bce43958fe553c67bf301292958 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:28:29 -0500 Subject: [PATCH 019/100] Fix function name --- aiopslab/service/telemetry/prometheus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/service/telemetry/prometheus.py b/aiopslab/service/telemetry/prometheus.py index b4f4c22b..288358a1 100644 --- a/aiopslab/service/telemetry/prometheus.py +++ b/aiopslab/service/telemetry/prometheus.py @@ -97,7 +97,7 @@ def _apply_pvc(self): def _delete_pvc(self): """Delete the PersistentVolume and associated PersistentVolumeClaim.""" - pvc_name = self._get_pv_name_from_file(self.pvc_config_file) + pvc_name = self._get_pvc_name_from_file(self.pvc_config_file) result = KubeCtl().exec_command(f"kubectl get pvc {pvc_name} --ignore-not-found") if result: @@ -107,7 +107,7 @@ def _delete_pvc(self): else: print(f"PersistentVolumeClaim {pvc_name} not found. Skipping deletion.") - def _get_pv_name_from_file(self, pv_config_file): + def _get_pvc_name_from_file(self, pv_config_file): """Extract PVC name from the configuration file.""" with open(pv_config_file, "r") as file: pv_config = yaml.safe_load(file) From ccc9c0de5009d632de5af57ebfeefb1522412670 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:36:15 -0500 Subject: [PATCH 020/100] fix function name --- aiopslab/service/telemetry/prometheus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/service/telemetry/prometheus.py b/aiopslab/service/telemetry/prometheus.py index 288358a1..5937976d 100644 --- a/aiopslab/service/telemetry/prometheus.py +++ b/aiopslab/service/telemetry/prometheus.py @@ -75,7 +75,7 @@ def deploy(self): if self.pvc_config_file: pvc_name = self._get_pvc_name_from_file(self.pvc_config_file) - if not self._pv_exists(pvc_name): + if not self._pvc_exists(pvc_name): self._apply_pvc() Helm.install(**self.helm_configs) From 504f4d913666b46d6cc7220f070c85b549c2ff97 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:52:06 -0500 Subject: [PATCH 021/100] Delete pvc instead of pv --- aiopslab/service/telemetry/prometheus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/service/telemetry/prometheus.py b/aiopslab/service/telemetry/prometheus.py index 5937976d..d2a50a06 100644 --- a/aiopslab/service/telemetry/prometheus.py +++ b/aiopslab/service/telemetry/prometheus.py @@ -102,7 +102,7 @@ def _delete_pvc(self): if result: print(f"Deleting PersistentVolumeClaim {pvc_name}") - KubeCtl().exec_command(f"kubectl delete pv {pvc_name}") + KubeCtl().exec_command(f"kubectl delete pvc {pvc_name}") print(f"Successfully deleted PersistentVolumeClaim from {pvc_name}") else: print(f"PersistentVolumeClaim {pvc_name} not found. 
Skipping deletion.") From 02a3e69573ee1d04845e458a1522e62c2551f190 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Fri, 28 Mar 2025 15:52:53 -0500 Subject: [PATCH 022/100] update parameter name --- aiopslab/service/telemetry/prometheus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/service/telemetry/prometheus.py b/aiopslab/service/telemetry/prometheus.py index d2a50a06..721e659a 100644 --- a/aiopslab/service/telemetry/prometheus.py +++ b/aiopslab/service/telemetry/prometheus.py @@ -113,9 +113,9 @@ def _get_pvc_name_from_file(self, pv_config_file): pv_config = yaml.safe_load(file) return pv_config["metadata"]["name"] - def _pvc_exists(self, pv_name: str) -> bool: + def _pvc_exists(self, pvc_name: str) -> bool: """Check if the PersistentVolumeClaim exists.""" - command = f"kubectl get pvc {pv_name}" + command = f"kubectl get pvc {pvc_name}" try: result = KubeCtl().exec_command(command) if "No resources found" in result or "Error" in result: From 3509ac1492f3bdcdba30daad064310c12cb264e5 Mon Sep 17 00:00:00 2001 From: Trunway Date: Wed, 9 Apr 2025 20:54:54 -0700 Subject: [PATCH 023/100] Fix issue 41 (Fix key error with TTA) Fix issue 41 (Fix key error with TTA) Changed the key "TTR" to "TTA" in aiopslab\orchestrator\tasks\analysis.py --- aiopslab/orchestrator/tasks/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/orchestrator/tasks/analysis.py b/aiopslab/orchestrator/tasks/analysis.py index 018deecc..be0116f2 100644 --- a/aiopslab/orchestrator/tasks/analysis.py +++ b/aiopslab/orchestrator/tasks/analysis.py @@ -81,6 +81,6 @@ def perform_action(self, action_name, *args, **kwargs): raise InvalidActionError(action_name) def eval(self, soln: Any, trace: list[SessionItem], duration: float): - self.add_result("TTR", duration) + self.add_result("TTA", duration) self.common_eval(trace) return self.results From 9711138e77990bcdc279f65bde626dba72526024 Mon Sep 17 00:00:00 2001 From: Trunway Date: Wed, 9 Apr 2025 21:11:25 -0700 Subject: [PATCH 024/100] Issue 42 (Cleanup before exit) Injected faults can lead to failure of starting the app next time when the program exits before fault recovery. 
Add codes to ensure the happening of fault recovery before exit if faults have been injected (exceptions or ctrl+c) --- aiopslab/generators/fault/base.py | 1 - aiopslab/orchestrator/orchestrator.py | 31 +++++++++++++++++++-------- aiopslab/utils/critical_section.py | 23 ++++++++++++++++++++ 3 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 aiopslab/utils/critical_section.py diff --git a/aiopslab/generators/fault/base.py b/aiopslab/generators/fault/base.py index 6122148c..c58dc7f3 100644 --- a/aiopslab/generators/fault/base.py +++ b/aiopslab/generators/fault/base.py @@ -59,7 +59,6 @@ def _recover( self._invoke_method("recover", fault_type, microservices) elif fault_type: self._invoke_method("recover", fault_type) - time.sleep(6) def _invoke_method(self, action_prefix, *args): """helper: injects/recovers faults based on name""" diff --git a/aiopslab/orchestrator/orchestrator.py b/aiopslab/orchestrator/orchestrator.py index 4d7d53d0..995cac05 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -9,10 +9,12 @@ from aiopslab.orchestrator.problems.registry import ProblemRegistry from aiopslab.orchestrator.parser import ResponseParser from aiopslab.utils.status import * +from aiopslab.utils.critical_section import CriticalSection from aiopslab.service.telemetry.prometheus import Prometheus import time import inspect import asyncio +import atexit class Orchestrator: @@ -68,8 +70,12 @@ def init_problem(self, problem_id: str): prob.app.delete() prob.app.deploy() - # inject fault - prob.inject_fault() + # make sure is_fault_injected is correct to apply appropriate + # function with atexit to recover fault + with CriticalSection(): + # inject fault + prob.inject_fault() + atexit.register(exit_cleanup_fault, prob=prob) # Check if start_workload is async or sync if inspect.iscoroutinefunction(prob.start_workload): @@ -121,7 +127,7 @@ async def ask_env(self, input): try: env_response = self.session.problem.perform_action(api, *args, **kwargs) - + if hasattr(env_response, "error"): env_response = str(env_response) print("An error occurred:", env_response) @@ -174,13 +180,15 @@ async def start_problem(self, max_steps: int): self.session.set_results(results) self.session.to_json() - self.session.problem.recover_fault() - # Beyond recovering from fault, - # I feel sometimes it is safer to delete the whole namespace. - # But this will take more time. - # if not self.session.problem.sys_status_after_recovery(): - self.session.problem.app.cleanup() + with CriticalSection(): + self.session.problem.recover_fault() + # Beyond recovering from fault, + # I feel sometimes it is safer to delete the whole namespace. + # But this will take more time. 
+ # if not self.session.problem.sys_status_after_recovery(): + self.session.problem.app.cleanup() + atexit.unregister(exit_cleanup_fault) self.execution_end_time = time.time() total_execution_time = self.execution_end_time - self.execution_start_time @@ -197,3 +205,8 @@ async def start_problem(self, max_steps: int): "results": results, "framework_overhead": framework_overhead, } + + +def exit_cleanup_fault(prob): + print("Recovering fault before exit...") + prob.recover_fault() diff --git a/aiopslab/utils/critical_section.py b/aiopslab/utils/critical_section.py new file mode 100644 index 00000000..78ad5f22 --- /dev/null +++ b/aiopslab/utils/critical_section.py @@ -0,0 +1,23 @@ +import signal + + +class CriticalSection: + def __enter__(self): + # Save the original signal handler + self.original_handler = signal.signal(signal.SIGINT, self.signal_handler) + self.signaled = False + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # If SIGINT was raised during the critical section, handle it after the block + if self.signaled: + raise KeyboardInterrupt # Re-raise KeyboardInterrupt to exit + + # Restore the original signal handler + signal.signal(signal.SIGINT, self.original_handler) + return False # Do not suppress exceptions + + def signal_handler(self, signum, frame): + """Handle SIGINT by just setting a flag to delay it.""" + self.signaled = True # Flag that SIGINT occurred + print("\nCtrl+C detected! But deferring the effect for consistency...") From 30f4a4245583a25d016becb286d162e79bbe694e Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:14:38 -0500 Subject: [PATCH 025/100] This file is not closed --- aiopslab/observer/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aiopslab/observer/__init__.py b/aiopslab/observer/__init__.py index 93267236..44a63f2e 100644 --- a/aiopslab/observer/__init__.py +++ b/aiopslab/observer/__init__.py @@ -12,7 +12,8 @@ root_path = pathlib.Path(__file__).parent sys.path.append(root_path) # read the configuration file -monitor_config = full_load(open(root_path / "monitor_config.yaml", "r")) +with open(root_path / "monitor_config.yaml", "r") as f: + monitor_config = full_load(f) # root_config = full_load(open(root_path / "config.yaml", "r")) From 2cac6fc6f7627222fe02e4659c79e1bb54594862 Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Thu, 10 Apr 2025 22:46:09 -0500 Subject: [PATCH 026/100] Fix assign-non-existent-node evaluation issue --- .../assign_non_existent_node_social_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py b/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py index 2f442ea1..1e32af27 100644 --- a/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py +++ b/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py @@ -191,7 +191,7 @@ def eval(self, soln: Any, trace: list[SessionItem], duration: float) -> dict: # Check if the faulty service exists faulty_service_exists = any( - pod.metadata.name == self.faulty_service for pod in pod_list.items + self.faulty_service in pod.metadata.name for pod in pod_list.items ) if not faulty_service_exists: print(f"Pod named {self.faulty_service} does not exist.") From 40029bb6cdd9cb9dd791ec87aa23e0133c0fc47d Mon Sep 17 00:00:00 2001 
From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:35:48 -0500 Subject: [PATCH 027/100] Enhances trace processing with error and response tracking --- aiopslab/observer/trace_api.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/aiopslab/observer/trace_api.py b/aiopslab/observer/trace_api.py index 77eabb05..03cfd9a9 100644 --- a/aiopslab/observer/trace_api.py +++ b/aiopslab/observer/trace_api.py @@ -223,22 +223,20 @@ def extract_traces( def process_traces(self, traces) -> pd.DataFrame: """Process raw traces data into a structured DataFrame.""" trace_id_list = [] + span_id_list = [] service_name_list = [] operation_name_list = [] start_time_list = [] duration_list = [] parent_span_list = [] + error_list = [] + response_list = [] for trace in traces: trace_id = trace["traceID"] for span in trace["spans"]: trace_id_list.append(trace_id) - service_name_list.append( - span["serviceName"] - ) # Use the correct service name from the span - operation_name_list.append(span["operationName"]) - start_time_list.append(span["startTime"]) - duration_list.append(span["duration"]) + span_id_list.append(span["spanID"]) parent_span = "ROOT" if "references" in span: for ref in span["references"]: @@ -247,14 +245,35 @@ def process_traces(self, traces) -> pd.DataFrame: break parent_span_list.append(parent_span) + service_name_list.append( + span["serviceName"] + ) # Use the correct service name from the span + operation_name_list.append(span["operationName"]) + start_time_list.append(span["startTime"]) + duration_list.append(span["duration"]) + + has_error = False + response = "Unknown" + for tag in span.get("tags", []): + if tag["key"] == "error" and tag["value"] == True: + has_error = True + if tag["key"] == "http.status_code" or tag["key"] == "response_class": + response = tag["value"] + error_list.append(has_error) + response_list.append(response) + + df = pd.DataFrame( { "trace_id": trace_id_list, + "span_id": span_id_list, + "parent_span": parent_span_list, "service_name": service_name_list, "operation_name": operation_name_list, "start_time": start_time_list, "duration": duration_list, - "parent_span": parent_span_list, + "has_error": error_list, + "response": response_list, } ) return df From 4cacda9739522d5d67e4cccd64474424774aa104 Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Mon, 14 Apr 2025 19:14:56 -0500 Subject: [PATCH 028/100] fix service name issue --- aiopslab/observer/trace_api.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/aiopslab/observer/trace_api.py b/aiopslab/observer/trace_api.py index 03cfd9a9..b2ffc102 100644 --- a/aiopslab/observer/trace_api.py +++ b/aiopslab/observer/trace_api.py @@ -211,9 +211,9 @@ def extract_traces( ) for trace in traces: for span in trace["spans"]: - span[ + span["serviceName"] = trace["processes"][span["processID"]][ "serviceName" - ] = service # Directly associate service name with each span + ] all_traces.append(trace) # Collect the trace with service name included self.cleanup() print("Cleanup completed.") @@ -251,7 +251,7 @@ def process_traces(self, traces) -> pd.DataFrame: operation_name_list.append(span["operationName"]) start_time_list.append(span["startTime"]) duration_list.append(span["duration"]) - + has_error = False response = "Unknown" for tag in span.get("tags", []): @@ -261,7 +261,6 @@ def process_traces(self, traces) -> pd.DataFrame: response = tag["value"] error_list.append(has_error) 
response_list.append(response) - df = pd.DataFrame( { From 24481178708041ba060d5e39fb5508cfc1d3ba96 Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Tue, 15 Apr 2025 16:18:24 -0500 Subject: [PATCH 029/100] Task prompts --- .../problems/ad_service_failure/ad_service_failure.py | 1 - .../problems/ad_service_high_cpu/ad_service_high_cpu.py | 1 - .../problems/ad_service_manual_gc/ad_service_manual_gc.py | 1 - .../assign_non_existent_node_social_net.py | 3 --- .../problems/auth_miss_mongodb/auth_miss_mongodb.py | 3 --- .../problems/cart_service_failure/cart_service_failure.py | 1 - .../orchestrator/problems/image_slow_load/image_slow_load.py | 1 - .../problems/k8s_target_port_misconfig/target_port.py | 3 --- .../problems/kafka_queue_problems/kafka_queue_problems.py | 1 - .../loadgenerator_flood_homepage.py | 1 - .../payment_service_failure/payment_service_failure.py | 1 - .../payment_service_unreachable/payment_service_unreachable.py | 1 - .../product_catalog_failure/product_catalog_failure.py | 1 - .../recommendation_service_cache_failure.py | 1 - .../orchestrator/problems/scale_pod/scale_pod_social_net.py | 3 --- aiopslab/orchestrator/tasks/localization.py | 1 + 16 files changed, 1 insertion(+), 23 deletions(-) diff --git a/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py b/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py index da07329d..d0dcc537 100644 --- a/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py +++ b/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py @@ -61,7 +61,6 @@ class AdServiceFailureLocalization(AdServiceFailureBaseTask, LocalizationTask): def __init__(self): AdServiceFailureBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the ad service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py b/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py index bf8d75e4..8543f039 100644 --- a/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py +++ b/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py @@ -61,7 +61,6 @@ class AdServiceHighCpuLocalization(AdServiceHighCpuBaseTask, LocalizationTask): def __init__(self): AdServiceHighCpuBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the ad service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py b/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py index da3a5ce2..b2e24b08 100644 --- a/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py +++ b/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py @@ -61,7 +61,6 @@ class AdServiceManualGcLocalization(AdServiceManualGcBaseTask, LocalizationTask) def __init__(self): AdServiceManualGcBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the ad service." 
def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py b/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py index 1e32af27..58c69d76 100644 --- a/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py +++ b/aiopslab/orchestrator/problems/assign_non_existent_node/assign_non_existent_node_social_net.py @@ -93,7 +93,6 @@ class AssignNonExistentNodeSocialNetLocalization( def __init__(self): AssignNonExistentNodeSocialNetBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -137,7 +136,6 @@ class AssignNonExistentNodeSocialNetAnalysis( def __init__(self): AssignNonExistentNodeSocialNetBaseTask.__init__(self) AnalysisTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -179,7 +177,6 @@ class AssignNonExistentNodeSocialNetMitigation( def __init__(self): AssignNonExistentNodeSocialNetBaseTask.__init__(self) MitigationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float) -> dict: print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/auth_miss_mongodb/auth_miss_mongodb.py b/aiopslab/orchestrator/problems/auth_miss_mongodb/auth_miss_mongodb.py index 6fe3bc52..a07f2c2c 100644 --- a/aiopslab/orchestrator/problems/auth_miss_mongodb/auth_miss_mongodb.py +++ b/aiopslab/orchestrator/problems/auth_miss_mongodb/auth_miss_mongodb.py @@ -86,7 +86,6 @@ class MongoDBAuthMissingLocalization(MongoDBAuthMissingBaseTask, LocalizationTas def __init__(self): MongoDBAuthMissingBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -128,7 +127,6 @@ class MongoDBAuthMissingAnalysis(MongoDBAuthMissingBaseTask, AnalysisTask): def __init__(self): MongoDBAuthMissingBaseTask.__init__(self) AnalysisTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -169,7 +167,6 @@ class MongoDBAuthMissingMitigation(MongoDBAuthMissingBaseTask, MitigationTask): def __init__(self): MongoDBAuthMissingBaseTask.__init__(self) MitigationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" # TODO: this migigate eval should be a bit different. 
# The error will not be on the container/pod level but the app level, diff --git a/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py b/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py index e65a9c11..15587bd9 100644 --- a/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py +++ b/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py @@ -61,7 +61,6 @@ class CartServiceFailureLocalization(CartServiceFailureBaseTask, LocalizationTas def __init__(self): CartServiceFailureBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the cart service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/image_slow_load/image_slow_load.py b/aiopslab/orchestrator/problems/image_slow_load/image_slow_load.py index df36db3c..7d075be1 100644 --- a/aiopslab/orchestrator/problems/image_slow_load/image_slow_load.py +++ b/aiopslab/orchestrator/problems/image_slow_load/image_slow_load.py @@ -61,7 +61,6 @@ class ImageSlowLoadLocalization(ImageSlowLoadBaseTask, LocalizationTask): def __init__(self): ImageSlowLoadBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the frontend service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/k8s_target_port_misconfig/target_port.py b/aiopslab/orchestrator/problems/k8s_target_port_misconfig/target_port.py index 16467783..59d0d9cc 100644 --- a/aiopslab/orchestrator/problems/k8s_target_port_misconfig/target_port.py +++ b/aiopslab/orchestrator/problems/k8s_target_port_misconfig/target_port.py @@ -88,7 +88,6 @@ class K8STargetPortMisconfigLocalization( def __init__(self, faulty_service: str = "user-service"): K8STargetPortMisconfigBaseTask.__init__(self, faulty_service=faulty_service) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -133,7 +132,6 @@ class K8STargetPortMisconfigAnalysis(K8STargetPortMisconfigBaseTask, AnalysisTas def __init__(self, faulty_service: str = "user-service"): K8STargetPortMisconfigBaseTask.__init__(self, faulty_service=faulty_service) AnalysisTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -167,7 +165,6 @@ class K8STargetPortMisconfigMitigation(K8STargetPortMisconfigBaseTask, Mitigatio def __init__(self, faulty_service: str = "user-service"): K8STargetPortMisconfigBaseTask.__init__(self, faulty_service=faulty_service) MitigationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compose-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float) -> dict: print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/kafka_queue_problems/kafka_queue_problems.py b/aiopslab/orchestrator/problems/kafka_queue_problems/kafka_queue_problems.py index 74bebf28..13e137dc 100644 --- a/aiopslab/orchestrator/problems/kafka_queue_problems/kafka_queue_problems.py +++ b/aiopslab/orchestrator/problems/kafka_queue_problems/kafka_queue_problems.py @@ -61,7 +61,6 @@ class 
KafkaQueueProblemsLocalization(KafkaQueueProblemsBaseTask, LocalizationTas def __init__(self): KafkaQueueProblemsBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the kafka service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/loadgenerator_flood_homepage/loadgenerator_flood_homepage.py b/aiopslab/orchestrator/problems/loadgenerator_flood_homepage/loadgenerator_flood_homepage.py index 3c7e79ef..b324b45a 100644 --- a/aiopslab/orchestrator/problems/loadgenerator_flood_homepage/loadgenerator_flood_homepage.py +++ b/aiopslab/orchestrator/problems/loadgenerator_flood_homepage/loadgenerator_flood_homepage.py @@ -66,7 +66,6 @@ class LoadGeneratorFloodHomepageLocalization( def __init__(self): LoadGeneratorFloodHomepageBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the frontend service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py b/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py index 8352068b..6d56f8ef 100644 --- a/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py +++ b/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py @@ -63,7 +63,6 @@ class PaymentServiceFailureLocalization( def __init__(self): PaymentServiceFailureBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the payment service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py b/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py index 529bc1c8..3c1b2cf1 100644 --- a/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py +++ b/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py @@ -65,7 +65,6 @@ class PaymentServiceUnreachableLocalization( def __init__(self): PaymentServiceUnreachableBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the payment service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/product_catalog_failure/product_catalog_failure.py b/aiopslab/orchestrator/problems/product_catalog_failure/product_catalog_failure.py index a9099218..6521cd5d 100644 --- a/aiopslab/orchestrator/problems/product_catalog_failure/product_catalog_failure.py +++ b/aiopslab/orchestrator/problems/product_catalog_failure/product_catalog_failure.py @@ -65,7 +65,6 @@ class ProductCatalogServiceFailureLocalization( def __init__(self): ProductCatalogServiceFailureBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the product catalog service." 
def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py b/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py index 4827bb83..1e0dd261 100644 --- a/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py +++ b/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py @@ -67,7 +67,6 @@ class RecommendationServiceCacheFailureLocalization( def __init__(self): RecommendationServiceCacheFailureBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the reccomendation service." def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") diff --git a/aiopslab/orchestrator/problems/scale_pod/scale_pod_social_net.py b/aiopslab/orchestrator/problems/scale_pod/scale_pod_social_net.py index 3077dc53..e9672e43 100644 --- a/aiopslab/orchestrator/problems/scale_pod/scale_pod_social_net.py +++ b/aiopslab/orchestrator/problems/scale_pod/scale_pod_social_net.py @@ -92,7 +92,6 @@ class ScalePodSocialNetLocalization(ScalePodSocialNetBaseTask, LocalizationTask) def __init__(self): ScalePodSocialNetBaseTask.__init__(self) LocalizationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -134,7 +133,6 @@ class ScalePodSocialNetAnalysis(ScalePodSocialNetBaseTask, AnalysisTask): def __init__(self): ScalePodSocialNetBaseTask.__init__(self) AnalysisTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float): print("== Evaluation ==") @@ -174,7 +172,6 @@ class ScalePodSocialNetMitigation(ScalePodSocialNetBaseTask, MitigationTask): def __init__(self): ScalePodSocialNetBaseTask.__init__(self) MitigationTask.__init__(self, self.app) - self.task_desc += "Start by investigating the `compost-post-service` pod" def eval(self, soln: Any, trace: list[SessionItem], duration: float) -> dict: print("== Evaluation ==") diff --git a/aiopslab/orchestrator/tasks/localization.py b/aiopslab/orchestrator/tasks/localization.py index d02a7c85..0685b638 100644 --- a/aiopslab/orchestrator/tasks/localization.py +++ b/aiopslab/orchestrator/tasks/localization.py @@ -31,6 +31,7 @@ def __init__(self, app: Application): The service you are working with today is described below: {app_summary} + You are requested to identify the ROOT CAUSE of the fault in the system. You should find the services which contain the root cause of the fault. You will begin by analyzing the service's state and telemetry, and then submit one of two possible solutions: 1. list[str]: list of faulty components (i.e., service names) 2. str: `None` if no faults were detected From 3ac8379068c3882d0089b77fc42f1e45f688dc4e Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Wed, 16 Apr 2025 11:27:02 -0500 Subject: [PATCH 030/100] Update README.md Fixed some weird wording in the README. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 167a5273..998a3291 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ If you're running into issues, consider building a Docker image for your machine After finishing cluster creation, proceed to the next "Update `config.yml`" step. ### b) Remote cluster -AIOpsLab supports any remote kubernetes cluster that your `kubectl` context is set to, whether it's a cluster from a cloud provider or one you build yourself. We have some Ansible playbooks we have to setup clusters on providers like [CloudLab](https://www.cloudlab.us/) and our own machines. Follow this [README](./scripts/ansible/README.md) to set up your own cluster, and then proceed to the next "Update `config.yml`" step. +AIOpsLab supports any remote kubernetes cluster that your `kubectl` context is set to, whether it's a cluster from a cloud provider or one you build yourself. We have some Ansible playbooks to setup clusters on providers like [CloudLab](https://www.cloudlab.us/) and our own machines. Follow this [README](./scripts/ansible/README.md) to set up your own cluster, and then proceed to the next "Update `config.yml`" step. ### Update `config.yml` ```bash From b86a6866115cc5883811530e2035b6767b70d8e7 Mon Sep 17 00:00:00 2001 From: Jiaqi Pan <26816351+daklqw@users.noreply.github.com> Date: Wed, 16 Apr 2025 15:58:26 -0500 Subject: [PATCH 031/100] Polish prompt --- aiopslab/orchestrator/tasks/localization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiopslab/orchestrator/tasks/localization.py b/aiopslab/orchestrator/tasks/localization.py index 0685b638..cce76149 100644 --- a/aiopslab/orchestrator/tasks/localization.py +++ b/aiopslab/orchestrator/tasks/localization.py @@ -31,7 +31,7 @@ def __init__(self, app: Application): The service you are working with today is described below: {app_summary} - You are requested to identify the ROOT CAUSE of the fault in the system. You should find the services which contain the root cause of the fault. + You are requested to identify the service(s) where the root cause of the fault lies. You will begin by analyzing the service's state and telemetry, and then submit one of two possible solutions: 1. list[str]: list of faulty components (i.e., service names) 2. str: `None` if no faults were detected From 0dca58ae7db25c4aee7ea5ae171252b252404db7 Mon Sep 17 00:00:00 2001 From: Trunway Date: Thu, 17 Apr 2025 00:47:46 -0700 Subject: [PATCH 032/100] Recover fault before users catch exceptions Users can catch exceptions and keep the program from exiting. In that case, functions registered with atexit will not run to recover the fault. We now catch exceptions before the user's code does and recover the fault in the "except" block to solve this problem.
--- aiopslab/orchestrator/orchestrator.py | 30 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/aiopslab/orchestrator/orchestrator.py b/aiopslab/orchestrator/orchestrator.py index 995cac05..7ec6f6c7 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -155,19 +155,29 @@ async def start_problem(self, max_steps: int): action, env_response, results = "", "", {} self.session.start() - for step in range(max_steps): - action = await self.ask_agent(action_instr) - self.sprint.agent(action) + # catch any exception and recover fault before the users catch it + try: + for step in range(max_steps): + action = await self.ask_agent(action_instr) + self.sprint.agent(action) - env_response = await self.ask_env(action) - self.sprint.service(env_response) + env_response = await self.ask_env(action) + self.sprint.service(env_response) - if env_response == SubmissionStatus.VALID_SUBMISSION: - break - elif env_response == SubmissionStatus.INVALID_SUBMISSION: - raise ValueError("Invalid submission!") # TODO (@manish): ask to retry? + if env_response == SubmissionStatus.VALID_SUBMISSION: + break + elif env_response == SubmissionStatus.INVALID_SUBMISSION: + raise ValueError("Invalid submission!") # TODO (@manish): ask to retry? - action_instr = env_response + "\n" + "Please take the next action" + action_instr = env_response + "\n" + "Please take the next action" + except Exception as e: + # Make sure the fault cleanup function is unregistered + # after recovering fault ahead because of exceptions + with CriticalSection(): + print("Some exception happened. Recovering the injected fault...") + self.session.problem.recover_fault() + atexit.unregister(exit_cleanup_fault) + raise e self.session.end() From 0741e0cb75ca979843170c88d247c4dedb205945 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:10:59 -0500 Subject: [PATCH 033/100] Add remote chart parameter --- aiopslab/service/helm.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/aiopslab/service/helm.py b/aiopslab/service/helm.py index c3efb2ca..2cc2c8f6 100644 --- a/aiopslab/service/helm.py +++ b/aiopslab/service/helm.py @@ -20,6 +20,7 @@ def install(**args): namespace (str): Namespace to install the chart version (str): Version of the chart extra_args (List[str)]: Extra arguments for the helm install command + remote_chart (bool): Whether the chart is remote (from a Helm repo) """ print("== Helm Install ==") release_name = args.get("release_name") @@ -27,16 +28,18 @@ def install(**args): namespace = args.get("namespace") version = args.get("version") extra_args = args.get("extra_args") - - # Install dependencies for chart before installation - dependency_command = f"helm dependency update {chart_path}" - dependency_process = subprocess.Popen( - dependency_command, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - dependency_output, dependency_error = dependency_process.communicate() + remote_chart = args.get("remote_chart", False) + + if not remote_chart: + # Install dependencies for chart before installation + dependency_command = f"helm dependency update {chart_path}" + dependency_process = subprocess.Popen( + dependency_command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + dependency_output, dependency_error = dependency_process.communicate() command = f"helm install {release_name} {chart_path} -n {namespace} --create-namespace" From 
82ed33f6300bbdbe09db4e3de79b5efed9961903 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:11:12 -0500 Subject: [PATCH 034/100] Switch to remote chart --- aiopslab/service/metadata/astronomy-shop.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/aiopslab/service/metadata/astronomy-shop.json b/aiopslab/service/metadata/astronomy-shop.json index a7881a20..617ac969 100644 --- a/aiopslab/service/metadata/astronomy-shop.json +++ b/aiopslab/service/metadata/astronomy-shop.json @@ -15,7 +15,8 @@ ], "Helm Config": { "release_name": "astronomy-shop", - "chart_path": "opentelemetry-helm-charts/charts/opentelemetry-demo", - "namespace": "astronomy-shop" + "chart_path": "open-telemetry/opentelemetry-demo", + "namespace": "astronomy-shop", + "remote_chart": true } } \ No newline at end of file From 37ad626098f4c57095d982ed04c17a814698ad37 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:11:32 -0500 Subject: [PATCH 035/100] nit: no need to print every loop iteration. --- aiopslab/service/kubectl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aiopslab/service/kubectl.py b/aiopslab/service/kubectl.py index 93fcc829..8b88ae33 100644 --- a/aiopslab/service/kubectl.py +++ b/aiopslab/service/kubectl.py @@ -80,9 +80,7 @@ def wait_for_ready(self, namespace, sleep=2, max_wait=300): try: pod_list = self.list_pods(namespace) - if not pod_list.items: - console.log(f"[yellow]No pods found in namespace '{namespace}', waiting...") - else: + if pod_list.items: ready_pods = [ pod for pod in pod_list.items if pod.status.container_statuses and From 4dc7c8eae4e7bb5f1c8c6571078b58e5230adbfb Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:11:41 -0500 Subject: [PATCH 036/100] Remove local path --- aiopslab/service/apps/astronomy_shop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aiopslab/service/apps/astronomy_shop.py b/aiopslab/service/apps/astronomy_shop.py index 68b81e09..264711c9 100644 --- a/aiopslab/service/apps/astronomy_shop.py +++ b/aiopslab/service/apps/astronomy_shop.py @@ -5,7 +5,6 @@ from aiopslab.service.helm import Helm from aiopslab.service.kubectl import KubeCtl from aiopslab.service.apps.base import Application -from aiopslab.paths import TARGET_MICROSERVICES from aiopslab.paths import ASTRONOMY_SHOP_METADATA From ec699118b8f18d628810793e0a5fcd7e307145a3 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:15:14 -0500 Subject: [PATCH 037/100] Only use TARGET_MICROSERVICES for local charts --- aiopslab/service/apps/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/aiopslab/service/apps/base.py b/aiopslab/service/apps/base.py index 214a8e38..02a30b15 100644 --- a/aiopslab/service/apps/base.py +++ b/aiopslab/service/apps/base.py @@ -28,8 +28,9 @@ def load_app_json(self): self.namespace = metadata["Namespace"] if "Helm Config" in metadata: self.helm_configs = metadata["Helm Config"] - if "chart_path" in self.helm_configs: - chart_path = self.helm_configs["chart_path"] + chart_path = self.helm_configs.get("chart_path") + + if chart_path and not self.helm_configs.get("remote_chart", False): self.helm_configs["chart_path"] = str(TARGET_MICROSERVICES / chart_path) if "K8S Deploy Path" in metadata: From a380b1f1a2c6c8df6e585c08d2923abe99a6e15d Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:51:58 -0500 Subject: [PATCH 038/100] Fix configmap name --- aiopslab/generators/fault/inject_otel.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/aiopslab/generators/fault/inject_otel.py b/aiopslab/generators/fault/inject_otel.py index 5b1bae40..8c733626 100644 --- a/aiopslab/generators/fault/inject_otel.py +++ b/aiopslab/generators/fault/inject_otel.py @@ -8,7 +8,7 @@ class OtelFaultInjector(FaultInjector): def __init__(self, namespace: str): self.namespace = namespace self.kubectl = KubeCtl() - self.configmap_name = f"{namespace}-flagd-config" + self.configmap_name = "flagd-config" def inject_fault(self, feature_flag: str): command = ( From d89a947aaa80e294c97610d3bac2ab9f6a609d78 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 20:58:54 -0500 Subject: [PATCH 039/100] Fix ad service failure name from configmap. --- .../problems/ad_service_failure/ad_service_failure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py b/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py index d0dcc537..ba73f0b6 100644 --- a/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py +++ b/aiopslab/orchestrator/problems/ad_service_failure/ad_service_failure.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("adServiceFailure") + self.injector.inject_fault("adFailure") print(f"Fault: adServiceFailure | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("adServiceFailure") + self.injector.recover_fault("adFailure") ################## Detection Problem ################## From 943ed0bd7dd62ccfe21ac71f42f8f097ec163947 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:14:58 -0500 Subject: [PATCH 040/100] Restart so changes take effect --- aiopslab/generators/fault/inject_otel.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/aiopslab/generators/fault/inject_otel.py b/aiopslab/generators/fault/inject_otel.py index 8c733626..2f1887b1 100644 --- a/aiopslab/generators/fault/inject_otel.py +++ b/aiopslab/generators/fault/inject_otel.py @@ -39,6 +39,11 @@ def inject_fault(self, feature_flag: str): self.kubectl.create_or_update_configmap( self.configmap_name, self.namespace, updated_data ) + + self.kubectl.exec_command( + f"kubectl rollout restart deployment flagd -n {self.namespace}" + ) + print(f"Fault injected: Feature flag '{feature_flag}' set to 'on'.") def recover_fault(self, feature_flag: str): @@ -70,6 +75,10 @@ def recover_fault(self, feature_flag: str): self.kubectl.create_or_update_configmap( self.configmap_name, self.namespace, updated_data ) + + self.kubectl.exec_command( + f"kubectl rollout restart deployment flagd -n {self.namespace}" + ) print(f"Fault recovered: Feature flag '{feature_flag}' set to 'off'.") From 578d9513ad47f2acb8d056bcd411f8e8dea27b2e Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:22:51 -0500 Subject: [PATCH 041/100] Update name. 
--- .../problems/ad_service_high_cpu/ad_service_high_cpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py b/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py index 8543f039..4aef1c32 100644 --- a/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py +++ b/aiopslab/orchestrator/problems/ad_service_high_cpu/ad_service_high_cpu.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("adServiceHighCpu") + self.injector.inject_fault("adHighCpu") print(f"Fault: AdServiceHighCpu | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("adServiceHighCpu") + self.injector.recover_fault("adHighCpu") ################## Detection Problem ################## From 740a4ae61335a8d4883863d535475f381086d4bc Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:27:50 -0500 Subject: [PATCH 042/100] Fix name to match configmap --- .../problems/ad_service_manual_gc/ad_service_manual_gc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py b/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py index b2e24b08..e5f24b68 100644 --- a/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py +++ b/aiopslab/orchestrator/problems/ad_service_manual_gc/ad_service_manual_gc.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("adServiceManualGc") + self.injector.inject_fault("adManualGc") print(f"Fault: adServiceManualGc | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("adServiceManualGc") + self.injector.recover_fault("adManualGc") ################## Detection Problem ################## From 487f5222f6061c36dde2a0fc8ac21a8ba013547f Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:29:27 -0500 Subject: [PATCH 043/100] Fix cm name --- .../recommendation_service_cache_failure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py b/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py index 1e0dd261..6073c8ac 100644 --- a/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py +++ b/aiopslab/orchestrator/problems/recommendation_service_cache_failure/recommendation_service_cache_failure.py @@ -24,14 +24,14 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("recommendationServiceCacheFailure") + self.injector.inject_fault("recommendationCacheFailure") print( f"Fault: recommendationServiceCacheFailure | Namespace: {self.namespace}\n" ) def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("recommendationServiceCacheFailure") + self.injector.recover_fault("recommendationCacheFailure") ################## Detection Problem ################## From a2d65b5a486a89e00ea31ddceed6c9585e841902 Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:30:14 -0500 Subject: [PATCH 044/100] change to cartFailure 
--- .../problems/cart_service_failure/cart_service_failure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py b/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py index 15587bd9..6575248e 100644 --- a/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py +++ b/aiopslab/orchestrator/problems/cart_service_failure/cart_service_failure.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("cartServiceFailure") + self.injector.inject_fault("cartFailure") print(f"Fault: cartServiceFailure | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("cartServiceFailure") + self.injector.recover_fault("cartFailure") ################## Detection Problem ################## From a01abada3559d297176c70afff2a8cfb602323ad Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:30:40 -0500 Subject: [PATCH 045/100] update cm name --- .../payment_service_failure/payment_service_failure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py b/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py index 6d56f8ef..b427f198 100644 --- a/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py +++ b/aiopslab/orchestrator/problems/payment_service_failure/payment_service_failure.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("paymentServiceFailure") + self.injector.inject_fault("paymentFailure") print(f"Fault: paymentServiceFailure | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("paymentServiceFailure") + self.injector.recover_fault("paymentFailure") ################## Detection Problem ################## From 35c9236000e6f78ff9919b1f1f3d21ee1f8c4ebc Mon Sep 17 00:00:00 2001 From: Jackson Clark Date: Thu, 17 Apr 2025 21:31:04 -0500 Subject: [PATCH 046/100] Fix names --- .../payment_service_unreachable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py b/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py index 3c1b2cf1..cc6b7702 100644 --- a/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py +++ b/aiopslab/orchestrator/problems/payment_service_unreachable/payment_service_unreachable.py @@ -24,12 +24,12 @@ def start_workload(self): def inject_fault(self): print("== Fault Injection ==") - self.injector.inject_fault("paymentServiceUnreachable") + self.injector.inject_fault("paymentUnreachable") print(f"Fault: paymentServiceUnreachable | Namespace: {self.namespace}\n") def recover_fault(self): print("== Fault Recovery ==") - self.injector.recover_fault("paymentServiceUnreachable") + self.injector.recover_fault("paymentUnreachable") ################## Detection Problem ################## From f8f83745dc475ec8e4fa0ac602cec4dd98eb3263 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Mon, 21 Apr 2025 01:02:08 +0800 Subject: [PATCH 047/100] Add proxy configuration tips for AIOpsLab users --- README.md | 13 +++++++++++++ 1 file changed, 13 
insertions(+) diff --git a/README.md b/README.md index 998a3291..1ee1764e 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,19 @@ kind create cluster --config kind/kind-config-arm.yaml If you're running into issues, consider building a Docker image for your machine by following this [README](kind/README.md). Please also open an issue. +### [Tips] +If you are running AIOpsLab using a proxy, beware of exporting the HTTP proxy as `172.17.0.1`. When creating the kind cluster, all the nodes in the cluster will inherit the proxy setting from the host environment and the Docker container. + +The `172.17.0.1` address is used to communicate with the host machine. For more details, refer to the official guide: [Configure Kind to Use a Proxy](https://kind.sigs.k8s.io/docs/user/quick-start/#configure-kind-to-use-a-proxy). + +Additionally, Docker doesn't support SOCKS5 proxy directly. If you're using a SOCKS5 protocol to proxy, you may need to use [Privoxy](https://www.privoxy.org) to forward SOCKS5 to HTTP. + +If you're running VLLM and the LLM agent locally, Privoxy will by default proxy `localhost`, which will cause errors. To avoid this issue, you should set the following environment variable: + +```bash +export no_proxy=localhost +``` + After finishing cluster creation, proceed to the next "Update `config.yml`" step. ### b) Remote cluster From 67a0e7471e6e90929e5c6b106fbde47277d5f040 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Mon, 21 Apr 2025 13:00:24 +0800 Subject: [PATCH 048/100] Refine task descriptions and instructions for smaller models to ensure format consistency, reducing errors and improving compliance with required output structure --- aiopslab/orchestrator/actions/detection.py | 2 +- aiopslab/orchestrator/tasks/analysis.py | 27 ++++++++++++------- aiopslab/orchestrator/tasks/detection.py | 29 ++++++++++++--------- aiopslab/orchestrator/tasks/localization.py | 29 +++++++++++++-------- aiopslab/orchestrator/tasks/mitigation.py | 25 +++++++++++------- clients/utils/templates.py | 6 ++--- 6 files changed, 71 insertions(+), 47 deletions(-) diff --git a/aiopslab/orchestrator/actions/detection.py b/aiopslab/orchestrator/actions/detection.py index ed2ffad3..3a3089b2 100644 --- a/aiopslab/orchestrator/actions/detection.py +++ b/aiopslab/orchestrator/actions/detection.py @@ -20,7 +20,7 @@ def submit(has_anomaly: str) -> SubmissionStatus: Submit if anomalies are detected to the orchestrator for evaluation. Args: - has_anomaly (str): Yes if anomalies are detected, No otherwise. + has_anomaly (str): "Yes" if anomalies are detected, "No" otherwise. Returns: SubmissionStatus: The status of the submission. diff --git a/aiopslab/orchestrator/tasks/analysis.py b/aiopslab/orchestrator/tasks/analysis.py index be0116f2..1186a61b 100644 --- a/aiopslab/orchestrator/tasks/analysis.py +++ b/aiopslab/orchestrator/tasks/analysis.py @@ -25,9 +25,9 @@ def __init__(self, app: Application): self.actions = AnalysisActions() self.task_desc = """\ - You are an expert DevOps engineer who has been tasked with doing root cause analysis in a deployed service. + You are an expert DevOps engineer assigned to do root cause analysis in a deployed service. - The service you are working with today is described below: + Service Details: {app_summary} You will begin by analyzing the service's state and telemetry, and then submit one of two possible solutions: @@ -51,16 +51,23 @@ def __init__(self, app: Application): self.instructions = """\ You will respond with one of the above APIs as your next action. 
Please respond in the following format in a markdown code block: - ``` - (, ...) - ``` + ```\n(, ...)\n``` - For example: - ``` - exec_shell("ls -l") # will list files in current directory - ``` + For instance, if you want to list files in current directory, your response must be exactly: + + ```\nexec_shell("ls -l")\n``` + + When submitting your analysis, use the following format: + + ```\nsubmit({"system_level": "your_system_level_analysis", "fault_type": "your_fault_type_analysis"})\n``` + + Replace "your_system_level_analysis" and "your_fault_type_analysis" with the actual analysis of the system level and fault type. + + Or, if no fault is detected, you should respond with: + + ```\nsubmit()\n``` - Please respond with only a single action per turn. + Please respond with only a single API call (a.k.a., action) per turn without any additional words, labels, or prefixes. """ def get_task_description(self): diff --git a/aiopslab/orchestrator/tasks/detection.py b/aiopslab/orchestrator/tasks/detection.py index 31a2d060..449a86cb 100644 --- a/aiopslab/orchestrator/tasks/detection.py +++ b/aiopslab/orchestrator/tasks/detection.py @@ -26,28 +26,33 @@ def __init__(self, app: Application): self.actions = DetectionActions() self.task_desc = """\ - You are an expert DevOps engineer who has been tasked with detecting anomalies in a deployed service. - - The service you are working with today is described below: + You are an expert DevOps engineer assigned to detect anomalies in a deployed service. + + Service Details: {app_summary} You will begin by analyzing the service's state and telemetry, and then submit your solution: - 1. str: `Yes` or `No` to indicate whether there are anomalies detected + 1. str: "Yes" or "No" to indicate whether there are anomalies detected """ self.instructions = """\ You will respond with one of the above APIs as your next action. Please respond in the following format in a markdown code block: - ``` - (, ...) - ``` + ```\n(, ...)\n``` + + For instance, if you want to list files in current directory, your response must be exactly: + + ```\nexec_shell("ls -l")\n``` + + If you decide that there are no anomalies: + + ```\nsubmit(\"No\")\n``` + + Or, if anomalies are found: - For example: - ``` - exec_shell("ls -l") # will list files in current directory - ``` + ```\nsubmit(\"Yes\")\n``` - Please respond with only a single action per turn. + Please respond with only a single API call (a.k.a., action) per turn without any additional words, labels, or prefixes. """ def get_task_description(self): diff --git a/aiopslab/orchestrator/tasks/localization.py b/aiopslab/orchestrator/tasks/localization.py index cce76149..6b3af40d 100644 --- a/aiopslab/orchestrator/tasks/localization.py +++ b/aiopslab/orchestrator/tasks/localization.py @@ -26,30 +26,37 @@ def __init__(self, app: Application): self.actions = LocalizationActions() self.task_desc = """\ - You are an expert DevOps engineer who has been tasked with localizing faults in a deployed service. + You are an expert DevOps engineer assigned to localize faults in a deployed service. - The service you are working with today is described below: + Service Details: {app_summary} You are requested to identify the service(s) where the root cause of the fault lies. You will begin by analyzing the service's state and telemetry, and then submit one of two possible solutions: 1. list[str]: list of faulty components (i.e., service names) - 2. str: `None` if no faults were detected + 2. 
list[]: an empty list if no faults were detected """ self.instructions = """\ You will respond with one of the above APIs as your next action. Please respond in the following format in a markdown code block: - ``` - (, ...) - ``` + ```\n(, ...)\n``` - For example: - ``` - exec_shell("ls -l") # will list files in current directory - ``` + For instance, if you want to list files in current directory, your response must be exactly: + + ```\nexec_shell("ls -l")\n``` - Please respond with only a single action per turn. + If there are faulty components to submit: + + ```\nsubmit([\"placeholderA\", \"placeholderB\"])\n``` + + (where "placeholderA" and "placeholderB" are placeholders; replace them with the faulty components in your environment) + + Or, if no faults are found: + + ```\nsubmit([])\n``` + + Please respond with only a single API call (a.k.a., action) per turn without any additional words, labels, or prefixes. """ def get_task_description(self): diff --git a/aiopslab/orchestrator/tasks/mitigation.py b/aiopslab/orchestrator/tasks/mitigation.py index 2f45e139..d15b62dd 100644 --- a/aiopslab/orchestrator/tasks/mitigation.py +++ b/aiopslab/orchestrator/tasks/mitigation.py @@ -25,9 +25,9 @@ def __init__(self, app: Application): self.actions = MitigationActions() self.task_desc = """\ - You are an expert DevOps engineer who has been tasked with mitigating anomalies in a deployed service. + You are an expert DevOps engineer assigned to mitigate anomalies in a deployed service. - The service you are working with today is described below: + Service Details: {app_summary} You will begin by analyzing the service's state and telemetry, and then submit a solution that mitigates any detected anomalies. @@ -37,16 +37,21 @@ def __init__(self, app: Application): self.instructions = """\ You will respond with one of the above APIs as your next action. Please respond in the following format in a markdown code block: - ``` - (, ...) - ``` + ```\n(, ...)\n``` - For example: - ``` - exec_shell("ls -l") # will list files in current directory - ``` + For instance, if you want to list files in current directory, your response must be exactly: + + ```\nexec_shell("ls -l")\n``` - Please respond with only a single action per turn. + Once your solution is complete and ready for evaluation, you must call: + + ```\nsubmit()\n``` + + Note: + - The submit() call for the mitigation task does not take any parameters. + - A submission via submit() is considered valid if it is made, though this does not necessarily indicate that your solution is correct. + + Please respond with only a single API call (a.k.a., action) per turn without any additional words, labels, or prefixes. """ def get_task_description(self): diff --git a/clients/utils/templates.py b/clients/utils/templates.py index db58b779..a2f6e25b 100644 --- a/clients/utils/templates.py +++ b/clients/utils/templates.py @@ -35,15 +35,15 @@ DOCS_SHELL_ONLY = """{prob_desc} You are provided with a direct API to a secure terminal to the service where you can run commands: - {shell_api} - Finally, you will submit your solution for this task using the following API: - {submit_api} +IMPORTANT: +1. The submit() call must strictly follow its defined parameter signature for this task. +2. Provide the call in a markdown code block. 
At each turn respond with: Action: From e14d4132dbdb07e53472f539445616858608efd2 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Mon, 21 Apr 2025 18:20:47 +0800 Subject: [PATCH 049/100] Update README and llm.py to support .env file for API keys management --- README.md | 9 ++++++++- clients/utils/llm.py | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ee1764e..0283cd6a 100644 --- a/README.md +++ b/README.md @@ -103,10 +103,17 @@ python3 cli.py Run GPT-4 baseline agent: ```bash -export OPENAI_API_KEY= +# Create a .env file in the project root (if not exists) +echo "OPENAI_API_KEY=" > .env +# Add more API keys as needed: +# echo "QWEN_API_KEY=" >> .env +# echo "DEEPSEEK_API_KEY=" >> .env + python3 clients/gpt.py # you can also change the problem to solve in the main() function ``` +The clients will automatically load API keys from your .env file. + You can check the running status of the cluster using [k9s](https://k9scli.io/) or other cluster monitoring tools conveniently.

⚙️ Usage

diff --git a/clients/utils/llm.py b/clients/utils/llm.py index 1a156a36..a1fd1bdb 100644 --- a/clients/utils/llm.py +++ b/clients/utils/llm.py @@ -7,6 +7,10 @@ from openai import OpenAI from pathlib import Path import json +from dotenv import load_dotenv + +# Load environment variables from the .env file +load_dotenv() CACHE_DIR = Path("./cache_dir") CACHE_PATH = CACHE_DIR / "cache.json" From c52b81607040333f45fb322c0196f97d7db9d374 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Mon, 21 Apr 2025 21:46:50 +0800 Subject: [PATCH 050/100] Add Weights & Biases integration for session logging and orchestrator initialization --- .gitignore | 3 +++ README.md | 10 ++++++++++ aiopslab/orchestrator/orchestrator.py | 5 ++++- aiopslab/session.py | 5 +++++ clients/gpt.py | 22 ++++++++++++++++++++-- 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ddac56ef..71e2721f 100644 --- a/.gitignore +++ b/.gitignore @@ -327,6 +327,9 @@ cython_debug/ # Visual Studio Code .vscode/ +# Weight & Biases +wandb/ + # Project specific cache_dir demos diff --git a/README.md b/README.md index 0283cd6a..7d70b17b 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,16 @@ The clients will automatically load API keys from your .env file. You can check the running status of the cluster using [k9s](https://k9scli.io/) or other cluster monitoring tools conveniently. +To browse your logged `session_id` values in the W&B app as a table: + +1. Make sure you have W&B installed and configured. +2. Run the following command to start the W&B server: + ```python + python3 clients/gpt.py --use_wandb + ``` +3. In the W&B web UI, open any run and click Tables → Add Query Panel. +4. In the key field, type `runs.summary` and click `Run`, then you will see the results displayed in a table format. +

⚙️ Usage

AIOpsLab can be used in the following ways: diff --git a/aiopslab/orchestrator/orchestrator.py b/aiopslab/orchestrator/orchestrator.py index afb90a0a..b74c56f9 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -18,7 +18,7 @@ class Orchestrator: - def __init__(self): + def __init__(self, use_wandb=False): self.agent = None self.session = None self.parser = ResponseParser() @@ -27,6 +27,7 @@ def __init__(self): self.execution_start_time = None self.execution_end_time = None self.kubectl = KubeCtl() + self.use_wandb = use_wandb def init_problem(self, problem_id: str): """Initialize a problem instance for the agent to solve. @@ -186,6 +187,8 @@ async def start_problem(self, max_steps: int): self.session.set_results(results) self.session.to_json() + if self.use_wandb: + self.session.to_wandb() with CriticalSection(): self.session.problem.recover_fault() diff --git a/aiopslab/session.py b/aiopslab/session.py index e1862ddc..dcdc09e4 100644 --- a/aiopslab/session.py +++ b/aiopslab/session.py @@ -6,6 +6,7 @@ import time import uuid import json +import wandb from pydantic import BaseModel from aiopslab.paths import RESULTS_DIR @@ -118,6 +119,10 @@ def to_json(self): with open(RESULTS_DIR / f"{self.session_id}_{self.start_time}.json", "w") as f: json.dump(self.to_dict(), f, indent=4) + + def to_wandb(self): + """Log the session to Weights & Biases.""" + wandb.log(self.to_dict()) def from_json(self, filename: str): """Load a session from a JSON file.""" diff --git a/clients/gpt.py b/clients/gpt.py index cad89299..c0fc3cbb 100644 --- a/clients/gpt.py +++ b/clients/gpt.py @@ -6,9 +6,10 @@ Code: https://openai.com/index/gpt-4-research/ Paper: https://arxiv.org/abs/2303.08774 """ - +import argparse import asyncio +import wandb from aiopslab.orchestrator import Orchestrator from clients.utils.llm import GPT4Turbo from clients.utils.templates import DOCS_SHELL_ONLY @@ -58,12 +59,29 @@ def _filter_dict(self, dictionary, filter_func): if __name__ == "__main__": + parser = argparse.ArgumentParser(description="OpenAI gpt client for AIOpsLab") + parser.add_argument( + "--use_wandb", + action="store_true", + default=False, + help="Enable Weights & Biases logging" + ) + args = parser.parse_args() + + if args.use_wandb: + # Initialize wandb run + wandb.init(project="AIOpsLab", entity="AIOpsLab") + agent = Agent() - orchestrator = Orchestrator() + orchestrator = Orchestrator(use_wandb=args.use_wandb) orchestrator.register_agent(agent, name="gpt-w-shell") pid = "misconfig_app_hotel_res-mitigation-1" problem_desc, instructs, apis = orchestrator.init_problem(pid) agent.init_context(problem_desc, instructs, apis) asyncio.run(orchestrator.start_problem(max_steps=10)) + + if args.use_wandb: + # Finish the wandb run + wandb.finish() From a862d39e8cd18c5a2d9aaaaf4dbda2dff81ee93f Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Tue, 22 Apr 2025 00:05:40 +0800 Subject: [PATCH 051/100] Refactor W&B integration to use environment variable for configuration and update README instructions --- README.md | 7 ++++--- clients/gpt.py | 22 ++++++++-------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7d70b17b..31bcb678 100644 --- a/README.md +++ b/README.md @@ -119,9 +119,10 @@ You can check the running status of the cluster using [k9s](https://k9scli.io/) To browse your logged `session_id` values in the W&B app as a table: 1. Make sure you have W&B installed and configured. -2. 
Run the following command to start the W&B server: - ```python - python3 clients/gpt.py --use_wandb +2. Set the USE_WANDB environment variable: + ```bash + # Add to your .env file + echo "USE_WANDB=true" >> .env ``` 3. In the W&B web UI, open any run and click Tables → Add Query Panel. 4. In the key field, type `runs.summary` and click `Run`, then you will see the results displayed in a table format. diff --git a/clients/gpt.py b/clients/gpt.py index c0fc3cbb..f7359a97 100644 --- a/clients/gpt.py +++ b/clients/gpt.py @@ -6,7 +6,7 @@ Code: https://openai.com/index/gpt-4-research/ Paper: https://arxiv.org/abs/2303.08774 """ -import argparse +import os import asyncio import wandb @@ -59,22 +59,16 @@ def _filter_dict(self, dictionary, filter_func): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="OpenAI gpt client for AIOpsLab") - parser.add_argument( - "--use_wandb", - action="store_true", - default=False, - help="Enable Weights & Biases logging" - ) - args = parser.parse_args() - - if args.use_wandb: - # Initialize wandb run + # Load use_wandb from environment variable with a default of False + use_wandb = os.getenv("USE_WANDB", "false").lower() == "true" + + if use_wandb: + # Initialize wandb runn wandb.init(project="AIOpsLab", entity="AIOpsLab") agent = Agent() - orchestrator = Orchestrator(use_wandb=args.use_wandb) + orchestrator = Orchestrator(use_wandb=use_wandb) orchestrator.register_agent(agent, name="gpt-w-shell") pid = "misconfig_app_hotel_res-mitigation-1" @@ -82,6 +76,6 @@ def _filter_dict(self, dictionary, filter_func): agent.init_context(problem_desc, instructs, apis) asyncio.run(orchestrator.start_problem(max_steps=10)) - if args.use_wandb: + if use_wandb: # Finish the wandb run wandb.finish() From a3d740368dc0c2b204487ff8a02e13efbf6002bd Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Tue, 22 Apr 2025 00:15:34 +0800 Subject: [PATCH 052/100] Fix typo in W&B initialization comment for clarity --- clients/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/gpt.py b/clients/gpt.py index f7359a97..c63474ee 100644 --- a/clients/gpt.py +++ b/clients/gpt.py @@ -63,7 +63,7 @@ def _filter_dict(self, dictionary, filter_func): use_wandb = os.getenv("USE_WANDB", "false").lower() == "true" if use_wandb: - # Initialize wandb runn + # Initialize wandb running wandb.init(project="AIOpsLab", entity="AIOpsLab") agent = Agent() From cad460de9e2369e93572a7a994c78d3557811e93 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Tue, 22 Apr 2025 01:02:09 +0800 Subject: [PATCH 053/100] Load environment variables from .env file to check the presence of "USE_WANDB" --- clients/gpt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clients/gpt.py b/clients/gpt.py index c63474ee..1a61ef7b 100644 --- a/clients/gpt.py +++ b/clients/gpt.py @@ -13,7 +13,10 @@ from aiopslab.orchestrator import Orchestrator from clients.utils.llm import GPT4Turbo from clients.utils.templates import DOCS_SHELL_ONLY +from dotenv import load_dotenv +# Load environment variables from the .env file +load_dotenv() class Agent: def __init__(self): From d9e6aa4614c79b75384c43419cfacd23381b0c62 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Tue, 22 Apr 2025 12:49:16 +0800 Subject: [PATCH 054/100] Refactor Orchestrator initialization to use environment variable for W&B configuration --- aiopslab/orchestrator/orchestrator.py | 5 +++-- clients/gpt.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/aiopslab/orchestrator/orchestrator.py 
b/aiopslab/orchestrator/orchestrator.py index b74c56f9..d3fbc455 100644 --- a/aiopslab/orchestrator/orchestrator.py +++ b/aiopslab/orchestrator/orchestrator.py @@ -15,10 +15,11 @@ import inspect import asyncio import atexit +import os class Orchestrator: - def __init__(self, use_wandb=False): + def __init__(self): self.agent = None self.session = None self.parser = ResponseParser() @@ -27,7 +28,7 @@ def __init__(self, use_wandb=False): self.execution_start_time = None self.execution_end_time = None self.kubectl = KubeCtl() - self.use_wandb = use_wandb + self.use_wandb = os.getenv("USE_WANDB", "false").lower() == "true" def init_problem(self, problem_id: str): """Initialize a problem instance for the agent to solve. diff --git a/clients/gpt.py b/clients/gpt.py index 1a61ef7b..b3248b11 100644 --- a/clients/gpt.py +++ b/clients/gpt.py @@ -71,7 +71,7 @@ def _filter_dict(self, dictionary, filter_func): agent = Agent() - orchestrator = Orchestrator(use_wandb=use_wandb) + orchestrator = Orchestrator() orchestrator.register_agent(agent, name="gpt-w-shell") pid = "misconfig_app_hotel_res-mitigation-1" From a1e0234b12724e5b3dbb2d25f1cecc4e103037b2 Mon Sep 17 00:00:00 2001 From: Warren Yeh Date: Tue, 22 Apr 2025 15:37:07 +0800 Subject: [PATCH 055/100] Add new clients and update README with usage instructions - Introduced DeepSeek and Qwen clients for AIOpsLab. - Updated README to include descriptions and setup instructions for vLLM client. - Refactored existing clients to use GPTClient instead of GPT4Turbo. - Added launch script for vLLM server. --- clients/README.md | 63 +- clients/deepseek.py | 86 + clients/flash.py | 6 +- clients/gpt.py | 4 +- clients/gpt_managed_identity.py | 6 +- clients/launch_vllm.sh | 15 + clients/qwen.py | 78 + clients/react.py | 4 +- clients/utils/llm.py | 139 +- clients/vllm.py | 91 + poetry.lock | 5283 +++++++++++++++++++++++++------ pyproject.toml | 4 + 12 files changed, 4837 insertions(+), 942 deletions(-) create mode 100644 clients/deepseek.py create mode 100755 clients/launch_vllm.sh create mode 100644 clients/qwen.py create mode 100644 clients/vllm.py diff --git a/clients/README.md b/clients/README.md index 29082d2f..65cf52c5 100644 --- a/clients/README.md +++ b/clients/README.md @@ -5,22 +5,75 @@ These clients are some baselines that we have implemented and evaluated to help ## Clients -- [GPT](/clients/gpt.py): A naive GPT4-based LLM agent with only shell access. +- [GPT](/clients/gpt.py): A naive GPT series LLM agent with only shell access. +- [DeepSeek](/clients/deepseek.py): A naive DeepSeek series LLM agent with only shell access. +- [Qwen](/clients/qwen.py): A naive Qwen series LLM agent with only shell access. +- [vLLM](/clients/vllm.py): A naive vLLM agent with any open source LLM deployed locally and only shell access. - [ReAct](/clients/react.py): A naive LLM agent that uses the ReAct framework. - [FLASH](/clients/flash.py): A naive LLM agent that uses status supervision and hindsight integration components to ensure the high reliability of workflow execution. +### Using the vLLM Client + +The vLLM client allows you to run local open-source models as an agent for AIOpsLab tasks. This approach is particularly useful when you want to: +- Use your own hardware for inference +- Experiment with different open-source models +- Work in environments without internet access to cloud LLM providers + +### Quick Setup Guide + +1. 
**Launch the vLLM server**: + ```bash + # Make the script executable + chmod +x ./clients/launch_vllm.sh + + # Run the script + ./clients/launch_vllm.sh + ``` + This will launch vLLM in the background using the default model (Qwen/Qwen2.5-3B-Instruct). + +2. **Check server status**: + ```bash + # View the log file to confirm the server is running + cat vllm_Qwen_Qwen2.5-3B-Instruct.log + ``` + +3. **Customize the model** (optional): + Edit `launch_vllm.sh` to change the model: + ```bash + # Open the file + nano ./clients/launch_vllm.sh + + # Change the MODEL variable to your preferred model + # Example: MODEL="mistralai/Mistral-7B-Instruct-v0.1" + ``` + +4. **Run the vLLM agent**: + ``` + python clients/vllm.py + ``` + +### Requirements + +- Poetry for dependency management +- Sufficient GPU resources for your chosen model +- The model must support the OpenAI chat completion API format + +### Advanced Configuration + +The vLLM client connects to `http://localhost:8000/v1` by default. If you've configured vLLM to use a different port or host, update the base_url in `clients/utils/llm.py` in the vLLMClient class. +