
Online weight update [WIP] #2119

Closed
wants to merge 31 commits
a3a57c2
update parameters online in model runner
zhaochenyang20 Nov 18, 2024
d9c8ca9
fix the rank config in init_process_group
zhaochenyang20 Nov 21, 2024
9944fdb
add get_parameter_by_name api for unit test of weight updates online
zhaochenyang20 Nov 21, 2024
eceed0d
test custom process group function
zhaochenyang20 Nov 22, 2024
032d909
revert update weights from disk
zhaochenyang20 Nov 22, 2024
ea6572a
[WIP] deadlock detected in init group for parameters
zhaochenyang20 Nov 22, 2024
3c155f8
revert srt/utils
zhaochenyang20 Nov 22, 2024
d566b98
fix init function name
zhaochenyang20 Nov 22, 2024
0045f98
fix deadlock of init process group
zhaochenyang20 Nov 22, 2024
7874f39
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 22, 2024
05cf5a7
Successfully initialized parameter update group between huggingface a…
zhaochenyang20 Nov 22, 2024
3596b01
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 22, 2024
779c2a0
[WIP] broadcast error
zhaochenyang20 Nov 23, 2024
8c8710e
[WIP] device and world size error
zhaochenyang20 Nov 23, 2024
88814d4
[WIP] merge router into weight update
zhaochenyang20 Nov 24, 2024
f3f87a4
[WIP] merge fix docs
zhaochenyang20 Nov 24, 2024
70755d8
[WIP] group conflicts
zhaochenyang20 Nov 24, 2024
13864e6
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 25, 2024
fd8ae0d
[WIP] get_max_total_num_tokens
zhaochenyang20 Nov 25, 2024
4e92954
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 25, 2024
bd7178b
[WIP] tp group failed to communicate
zhaochenyang20 Nov 26, 2024
59eab7e
finish get weights by parameter name
zhaochenyang20 Nov 26, 2024
fcdb8ca
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 26, 2024
a13f76a
failed to clean cache
zhaochenyang20 Nov 27, 2024
ea7b941
[WIP] failed to load parameters
zhaochenyang20 Nov 27, 2024
b6ffa9e
success to update weights in engine
zhaochenyang20 Nov 27, 2024
e2d1324
success in broadcast and read weights
zhaochenyang20 Nov 27, 2024
84b32f7
init weight-update-test-2-gpu
zhaochenyang20 Nov 27, 2024
918c47b
remove print
zhaochenyang20 Nov 27, 2024
c293a6a
fix tp in all parameter read
zhaochenyang20 Nov 27, 2024
f438931
Merge branch 'main' into online-weight-update
zhaochenyang20 Nov 28, 2024
18 changes: 18 additions & 0 deletions .github/workflows/pr-test.yml
@@ -263,6 +263,24 @@ jobs:
cd test/srt
python3 test_moe_eval_accuracy_large.py

weight-update-test-2-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh

- name: Test weight update (TP, DP = 2,1 or 1,2)
timeout-minutes: 20
run: |
cd test/srt
python3 test_update_parameter_from_distributed.py
python3 test_get_parameter_by_name.py

finish:
needs: [
unit-test-frontend, unit-test-backend-part-1, unit-test-backend-part-2, unit-test-backend-part-3, unit-test-backend-part-4,
2 changes: 1 addition & 1 deletion 3rdparty/amd/profiling/PROFILING.md
@@ -421,5 +421,5 @@ index 62d1ff9..6ecd78c 100644
3. Modify the included server.sh by removing "loadTracer.sh" before python command and launch script ./server.sh in one terminal inside the docker container.

4. Similar to step 6 in RPD profiling section, but remove the last 2 lines in client.sh, which converted rpd file into csv and json files. Run modified client.sh for PyTorch profiling.
=======
-------
- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
32 changes: 21 additions & 11 deletions docs/backend/native_api.ipynb
@@ -14,7 +14,7 @@
"- `/health`\n",
"- `/health_generate`\n",
"- `/flush_cache`\n",
"- `/update_weights`\n",
"- `/update_weights_from_disk`\n",
"- `/encode`(embedding model)\n",
"- `/classify`(reward model)\n",
"\n",
@@ -98,7 +98,7 @@
"print_highlight(response_json)\n",
"assert response_json[\"model_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n",
"assert response_json[\"is_generation\"] is True\n",
"assert response_json.keys() == {\"model_path\", \"is_generation\"}"
"assert list(response_json.keys()) == [\"model_path\", \"tokenizer_path\", \"is_generation\"]"
]
},
{
@@ -144,8 +144,7 @@
"source": [
"url = \"http://localhost:30010/health_generate\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
"response = requests.get(url)"
]
},
{
@@ -156,8 +155,7 @@
"source": [
"url = \"http://localhost:30010/health\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
"response = requests.get(url)"
]
},
{
@@ -187,9 +185,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update Weights\n",
"## Update Weights From Disk\n",
"\n",
"Update model weights without restarting the server. Use for continuous evaluation during training. Only applicable for models with the same architecture and parameter size."
"Update model weights from disk without restarting the server. Use for continuous evaluation during training. Only applicable for models with the same architecture and parameter size."
]
},
{
@@ -200,7 +198,7 @@
"source": [
"# successful update with same architecture and size\n",
"\n",
"url = \"http://localhost:30010/update_weights\"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -218,7 +216,7 @@
"source": [
"# failed update with different parameter size\n",
"\n",
"url = \"http://localhost:30010/update_weights\"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-3B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -340,7 +338,19 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"terminate_process(reward_process)"
]
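The notebook rename above keeps the request body of the old `/update_weights` endpoint unchanged. A minimal client-side sketch of building that payload (the helper name is illustrative, not part of this PR; the payload would be POSTed to `http://<host>:<port>/update_weights_from_disk`, e.g. with `requests.post(url, json=payload)`):

```python
def build_update_weights_request(model_path, load_format=None):
    """Build the JSON payload for the /update_weights_from_disk endpoint.

    Only the fields defined in UpdateWeightFromDistReqInput are included;
    load_format is optional and omitted when not set.
    """
    payload = {"model_path": model_path}
    if load_format is not None:
        payload["load_format"] = load_format
    return payload


# Same payload as the notebook's successful-update cell.
payload = build_update_weights_request("meta-llama/Llama-3.2-1B")
```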
51 changes: 49 additions & 2 deletions python/sglang/srt/managers/io_struct.py
@@ -352,19 +352,66 @@ class FlushCacheReq:


@dataclass
class UpdateWeightReqInput:
class UpdateWeightFromDistReqInput:
# The model path with the new weights
model_path: str
# The format to load the weights
load_format: Optional[str] = None


@dataclass
class UpdateWeightReqOutput:
class UpdateWeightFromDistReqOutput:
success: bool
message: str


@dataclass
class UpdateParameterFromDistributedReqInput:
name: str
dtype: str
shape: List[int]
empty_cache: bool


@dataclass
class UpdateParameterFromDistributedReqOutput:
success: bool
message: str


@dataclass
class InitParameterUpdateGroupReqInput:
# The master address
master_address: str
# The master port
master_port: int
# The rank offset
rank_offset: int
# The world size
world_size: int
# The group name
group_name: str
# The backend
backend: str = "nccl"


@dataclass
class InitParameterUpdateGroupReqOutput:
success: bool
message: str


@dataclass
class GetParameterByNameReqInput:
name: str
truncate_size: int = 100


@dataclass
class GetParameterByNameReqOutput:
parameter: list


@dataclass
class AbortReq:
# The request id
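The `rank_offset` field in `InitParameterUpdateGroupReqInput` lets a training process and the inference engine join one process group: the trainer holds the low ranks and each engine TP rank maps to `rank_offset + tp_rank`. A small sketch of one plausible layout (the class and names here are illustrative, not part of this PR):

```python
from dataclasses import dataclass


@dataclass
class GroupLayout:
    trainer_ranks: int   # ranks held by the training process(es)
    engine_tp_size: int  # TP ranks on the inference engine

    @property
    def world_size(self) -> int:
        # Every trainer rank plus every engine TP rank joins the group.
        return self.trainer_ranks + self.engine_tp_size

    def engine_rank(self, tp_rank: int) -> int:
        # rank_offset shifts engine ranks past the trainer's ranks.
        rank_offset = self.trainer_ranks
        return rank_offset + tp_rank


# One trainer rank plus a TP=2 engine gives world_size 3;
# engine TP rank 1 becomes global rank 2 in the shared group.
layout = GroupLayout(trainer_ranks=1, engine_tp_size=2)
```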
56 changes: 49 additions & 7 deletions python/sglang/srt/managers/scheduler.py
@@ -38,13 +38,19 @@
FlushCacheReq,
GetMemPoolSizeReq,
GetMemPoolSizeReqOutput,
GetParameterByNameReqInput,
GetParameterByNameReqOutput,
InitParameterUpdateGroupReqInput,
InitParameterUpdateGroupReqOutput,
OpenSessionReqInput,
OpenSessionReqOutput,
ProfileReq,
TokenizedEmbeddingReqInput,
TokenizedGenerateReqInput,
UpdateWeightReqInput,
UpdateWeightReqOutput,
UpdateParameterFromDistributedReqInput,
UpdateParameterFromDistributedReqOutput,
UpdateWeightFromDistReqInput,
UpdateWeightFromDistReqOutput,
)
from sglang.srt.managers.schedule_batch import (
FINISH_ABORT,
@@ -500,10 +506,25 @@ def process_input_requests(self, recv_reqs: List):
self.flush_cache()
elif isinstance(recv_req, AbortReq):
self.abort_request(recv_req)
elif isinstance(recv_req, UpdateWeightReqInput):
success, message = self.update_weights(recv_req)
elif isinstance(recv_req, UpdateWeightFromDistReqInput):
success, message = self.update_weights_from_disk(recv_req)
self.send_to_tokenizer.send_pyobj(
UpdateWeightReqOutput(success, message)
UpdateWeightFromDistReqOutput(success, message)
)
elif isinstance(recv_req, GetParameterByNameReqInput):
parameter = self.get_weights_by_parameter_name(recv_req)
self.send_to_tokenizer.send_pyobj(
GetParameterByNameReqOutput(parameter)
)
elif isinstance(recv_req, InitParameterUpdateGroupReqInput):
success, message = self.init_parameter_update_group(recv_req)
self.send_to_tokenizer.send_pyobj(
InitParameterUpdateGroupReqOutput(success, message)
)
elif isinstance(recv_req, UpdateParameterFromDistributedReqInput):
success, message = self.update_parameter_from_distributed(recv_req)
self.send_to_tokenizer.send_pyobj(
UpdateParameterFromDistributedReqOutput(success, message)
)
elif isinstance(recv_req, ProfileReq):
if recv_req == ProfileReq.START_PROFILE:
@@ -1353,16 +1374,37 @@ def abort_request(self, recv_req: AbortReq):
self.tree_cache.cache_finished_req(req)
break

def update_weights(self, recv_req: UpdateWeightReqInput):
def update_weights_from_disk(self, recv_req: UpdateWeightFromDistReqInput):
"""In-place update of the weights."""
success, message = self.tp_worker.update_weights(recv_req)
success, message = self.tp_worker.update_weights_from_disk(recv_req)
if success:
flash_cache_success = self.flush_cache()
assert flash_cache_success, "Cache flush failed after updating weights"
else:
logger.error(message)
return success, message

def init_parameter_update_group(self, recv_req: InitParameterUpdateGroupReqInput):
"""Initialize the online model parameter update group."""
success, message = self.tp_worker.init_parameter_update_group(recv_req)
return success, message

def update_parameter_from_distributed(
self, recv_req: UpdateParameterFromDistributedReqInput
):
"""Update the online model parameter."""
success, message = self.tp_worker.update_parameter_from_distributed(recv_req)
if success:
flash_cache_success = self.flush_cache()
assert flash_cache_success, "Cache flush failed after updating weights"
else:
logger.error(message)
return success, message

def get_weights_by_parameter_name(self, recv_req: GetParameterByNameReqInput):
parameter = self.tp_worker.get_weights_by_parameter_name(recv_req)
return parameter

def start_profile(self) -> None:
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
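The scheduler changes above all follow one pattern: branch on the request's dataclass type, call the matching `tp_worker` method, and send a `(success, message)` result back to the tokenizer. A stripped-down sketch of that dispatch (the request classes here are stand-ins for the ones in `io_struct.py`, not the real API):

```python
from dataclasses import dataclass


@dataclass
class InitGroupReq:
    group_name: str


@dataclass
class UpdateParamReq:
    name: str


def dispatch(req):
    """Mirror of process_input_requests: branch on the request type and
    return the (success, message) pair the scheduler would send back."""
    if isinstance(req, InitGroupReq):
        return True, f"initialized group {req.group_name}"
    if isinstance(req, UpdateParamReq):
        # The real scheduler also flushes the radix cache on success.
        return True, f"updated parameter {req.name}"
    raise ValueError(f"unsupported request: {type(req).__name__}")
```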