Debug studio #1831

Closed
zolinthecow wants to merge 31 commits into main from debug-studio
Changes from all commits
31 commits
e9aa1f6
logging request works
zolinthecow Oct 28, 2024
f816836
change to use stream executor submit
zolinthecow Oct 29, 2024
25cd38d
add support for anthropic
zolinthecow Oct 29, 2024
86154f7
support litellm + edits
zolinthecow Oct 29, 2024
4c4b19d
support openai
zolinthecow Oct 29, 2024
04a950c
add for choices too
zolinthecow Oct 29, 2024
1daae8a
support vertexai
zolinthecow Oct 29, 2024
0bde1b1
add docs
zolinthecow Oct 29, 2024
8ed7d1c
add port arg
zolinthecow Oct 29, 2024
4754915
add unit tests
zolinthecow Oct 29, 2024
d816597
remove debug print
zolinthecow Oct 29, 2024
6fa7c56
remove another debug log
zolinthecow Oct 29, 2024
ccc9f57
accidently removed some lines in readme
zolinthecow Oct 29, 2024
b7dc1b5
remove another debug log
zolinthecow Oct 29, 2024
4b80787
bump enochian-studio version
zolinthecow Oct 29, 2024
480ba87
bump enochian-studio version for debugging CI runner
zolinthecow Oct 29, 2024
17601ea
bump version to hopefully make it pass
zolinthecow Oct 29, 2024
649bc94
even more logging
zolinthecow Oct 29, 2024
996af70
file lock on download_node so hopefully race condition goes away
zolinthecow Oct 29, 2024
f289e01
is it running twice?
zolinthecow Oct 29, 2024
3ce1499
bump again to clear old data if its bad
zolinthecow Oct 29, 2024
61da822
bump to test again
zolinthecow Oct 30, 2024
449b724
better logging
zolinthecow Oct 30, 2024
21e72df
should be fixed
zolinthecow Oct 30, 2024
cb74c81
always delete
zolinthecow Oct 30, 2024
324cfbe
set the PATH properly
zolinthecow Oct 30, 2024
7d82409
merge
zolinthecow Nov 1, 2024
e30a806
left in a merge conflict
zolinthecow Nov 1, 2024
052c437
Merge branch 'main' into debug-studio
zolinthecow Nov 2, 2024
776e727
Merge branch 'main' into debug-studio
zolinthecow Nov 2, 2024
18da680
Merge branch 'main' into debug-studio
zolinthecow Nov 3, 2024
29 changes: 29 additions & 0 deletions docs/frontend/frontend.md
@@ -233,6 +233,35 @@ def chat_example(s):
    s += sgl.assistant_end()
```

### Debug Studio

The frontend also provides a debug studio for inspecting exactly what is passed into the runtime endpoint's generation API.
To use it, first start the debug server:

```bash
python -m sglang.launch_debug_server
```

Reviewer suggestion (Contributor):

```diff
-python -m sglang.launch_debug_server
+python -m sglang.lang.launch_debug_server
```

It will start a debug server on port 56765. Then, add a debug region to an `sgl.function`:

```python
@sgl.function
def text_qa(s, question):
    s.begin_debug_region("TEXT_QA")
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

state = text_qa.run(
    question="What is the capital of France?",
    temperature=0.1,
    stream=True
)
```

When you navigate to `http://localhost:56765` (if you're on a remote server, forward the port over SSH), you should see a web app showing the prompt and response.
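A typical SSH port-forwarding command looks something like the following (the user and host names below are placeholders):

```bash
# Forward the remote debug studio port 56765 to the same port on your local machine
ssh -L 56765:localhost:56765 user@remote-host
```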

<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/debug_studio_example.png" alt="prompt_studio_demo" margin="10px">

### Tips and Implementation Details
- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability (see the sketch after this list).
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
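As an illustration of that token-length normalization (a minimal sketch with made-up per-token log probabilities, not the actual sglang internals):

```python
# Hypothetical per-token log probabilities for each choice; sglang derives
# these from the model's logits, the numbers here are for illustration only.
choice_token_logprobs = {
    "Paris": [-0.2, -0.1],                         # 2 tokens
    "The city of Lyon": [-0.5, -0.6, -0.4, -0.9],  # 4 tokens
}

def normalized_logprob(token_logprobs):
    # Mean log probability per token, so longer choices are not penalized
    # merely for containing more tokens.
    return sum(token_logprobs) / len(token_logprobs)

best = max(choice_token_logprobs, key=lambda c: normalized_logprob(choice_token_logprobs[c]))
print(best)  # "Paris" (-0.15 per token vs. -0.6 per token)
```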
8 changes: 5 additions & 3 deletions python/pyproject.toml
@@ -28,6 +28,8 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
srt_xpu = ["sglang[runtime_common]"]

studio = ["enochian-studio>=0.0.3.post10"]

openai = ["openai>=1.0", "tiktoken"]
anthropic = ["anthropic>=0.20.0"]
litellm = ["litellm>=1.0.0"]
@@ -39,9 +41,9 @@ test = [
"accelerate",
"peft",
]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[studio]"]
all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[studio]"]
all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]", "sglang[studio]"]
dev = ["sglang[all]", "sglang[test]"]
dev_hip = ["sglang[all_hip]", "sglang[test]"]
dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
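For reference, installing with the new optional dependency group would look something like this (standard pip extras syntax; `studio` is the extra defined above):

```bash
# Pull in enochian-studio via the new "studio" extra
pip install "sglang[studio]"

# "all" now includes the studio extra as well
pip install "sglang[all]"
```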
1 change: 1 addition & 0 deletions python/sglang/README.md
@@ -9,4 +9,5 @@
- `bench_serving.py`: Benchmark online serving with dynamic requests.
- `global_config.py`: The global configs and constants.
- `launch_server.py`: The entry point for launching the local server.
- `launch_debug_server.py`: The entry point for launching the debug server + web app
Reviewer comment (Contributor): This is an experimental feature for frontend language only, so please move it under `python/sglang/lang`.

- `utils.py`: Common utilities.
53 changes: 53 additions & 0 deletions python/sglang/lang/backend/anthropic.py
@@ -1,3 +1,5 @@
import uuid
from datetime import datetime
from typing import List, Optional, Union

import numpy as np
@@ -42,6 +44,20 @@ def generate(
        else:
            system = ""

        debug_request_id = str(uuid.uuid4())
        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "requestPrompt": str(
                        [{"role": "system", "content": system}] + messages
                    ),
                    "requestTimestamp": datetime.now().isoformat(),
                    "requestMetadata": sampling_params.to_anthropic_kwargs(),
                }
            ]
        )

        ret = self.client.messages.create(
            model=self.model_name,
            system=system,
@@ -50,6 +66,17 @@ )
        )
        comp = ret.content[0].text

        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "responseContent": comp,
                    "responseTimestamp": datetime.now().isoformat(),
                    "responseMetadata": ret.to_json(),
                }
            ]
        )

        return comp, {}

    def generate_stream(
@@ -67,6 +94,20 @@ def generate_stream(
        else:
            system = ""

        debug_request_id = str(uuid.uuid4())
        debug_obj = s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "requestPrompt": str(
                        [{"role": "system", "content": system}] + messages
                    ),
                    "requestTimestamp": datetime.now().isoformat(),
                    "requestMetadata": sampling_params.to_anthropic_kwargs(),
                }
            ]
        )
Reviewer comment (Contributor) on lines +97 to +109: This is not efficient enough. When debug is turned off, you still run the code to construct the argument, which takes some time. Please minimize the overhead and do not construct any objects when debug is turned off.
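One possible way to address this, sketched under the assumption of a hypothetical `s.debug_enabled()` predicate (not an existing sglang API), is to guard the payload construction entirely:

```python
# Sketch only: build the debug payload only when debugging is active.
# `s.debug_enabled()` is a hypothetical check, not part of the current interpreter API.
debug_request_id = None
if s.debug_enabled():
    debug_request_id = str(uuid.uuid4())
    s.log_debug(
        [
            {
                "id": debug_request_id,
                "requestPrompt": str(
                    [{"role": "system", "content": system}] + messages
                ),
                "requestTimestamp": datetime.now().isoformat(),
                "requestMetadata": sampling_params.to_anthropic_kwargs(),
            }
        ]
    )
```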


        with self.client.messages.stream(
            model=self.model_name,
            system=system,
@@ -75,3 +116,15 @@ ) as stream:
        ) as stream:
            for text in stream.text_stream:
                yield text, {}
            final_message = stream.get_final_message()
            final_message_json = final_message.to_json()
            s.log_debug(
                [
                    {
                        "id": debug_request_id,
                        "responseContent": final_message.content[0].text,
                        "responseTimestamp": datetime.now().isoformat(),
                        "responseMetadata": final_message_json,
                    }
                ]
            )
57 changes: 57 additions & 0 deletions python/sglang/lang/backend/litellm.py
@@ -1,3 +1,5 @@
import uuid
from datetime import datetime
from typing import Mapping, Optional

from sglang.lang.backend.base_backend import BaseBackend
@@ -57,6 +59,21 @@ def generate(
        else:
            messages = [{"role": "user", "content": s.text_}]

        debug_request_id = str(uuid.uuid4())
        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "requestPrompt": str(messages),
                    "requestTimestamp": datetime.now().isoformat(),
                    "requestMetadata": {
                        **self.client_params,
                        **sampling_params.to_litellm_kwargs(),
                    },
                }
            ]
        )

        ret = litellm.completion(
            model=self.model_name,
            messages=messages,
@@ -65,6 +82,17 @@ )
        )
        comp = ret.choices[0].message.content

        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "responseContent": comp,
                    "responseTimestamp": datetime.now().isoformat(),
                    "responseMetadata": ret.to_json(),
                }
            ]
        )

        return comp, {}

    def generate_stream(
@@ -77,14 +105,43 @@ def generate_stream(
        else:
            messages = [{"role": "user", "content": s.text_}]

        debug_request_id = str(uuid.uuid4())
        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "requestPrompt": str(messages),
                    "requestTimestamp": datetime.now().isoformat(),
                    "requestMetadata": {
                        **self.client_params,
                        **sampling_params.to_litellm_kwargs(),
                    },
                }
            ]
        )

        ret = litellm.completion(
            model=self.model_name,
            messages=messages,
            stream=True,
            **self.client_params,
            **sampling_params.to_litellm_kwargs(),
        )

        full_text = ""
        for chunk in ret:
            text = chunk.choices[0].delta.content
            if text is not None:
                full_text += text
                yield text, {}

        s.log_debug(
            [
                {
                    "id": debug_request_id,
                    "responseContent": full_text,
                    "responseTimestamp": datetime.now().isoformat(),
                    "responseMetadata": {},
                }
            ]
        )