wanglei19991004
diff --git a/flagscale/run.py b/flagscale/run.py
Lines changed: 7 additions & 1 deletion
diff --git a/flagscale/runner/backend/backend_llama_cpp.py b/flagscale/runner/backend/backend_llama_cpp.py
Lines changed: 7 additions & 9 deletions
diff --git a/flagscale/runner/backend/backend_megatron.py b/flagscale/runner/backend/backend_megatron.py
Lines changed: 7 additions & 12 deletions
diff --git a/flagscale/runner/backend/backend_native_compress.py b/flagscale/runner/backend/backend_native_compress.py
Lines changed: 7 additions & 11 deletions
diff --git a/flagscale/runner/backend/backend_native_serve.py b/flagscale/runner/backend/backend_native_serve.py
Lines changed: 3 additions & 2 deletions
diff --git a/flagscale/runner/backend/backend_native_train.py b/flagscale/runner/backend/backend_native_train.py
Lines changed: 7 additions & 12 deletions
diff --git a/flagscale/runner/backend/backend_sglang.py b/flagscale/runner/backend/backend_sglang.py
Lines changed: 3 additions & 2 deletions
diff --git a/flagscale/runner/backend/backend_verl.py b/flagscale/runner/backend/backend_verl.py
Lines changed: 7 additions & 13 deletions
diff --git a/flagscale/runner/backend/backend_vllm.py b/flagscale/runner/backend/backend_vllm.py
Lines changed: 11 additions & 13 deletions
diff --git a/flagscale/runner/launcher/launcher_cloud.py b/flagscale/runner/launcher/launcher_cloud.py
Lines changed: 5 additions & 6 deletions
@@ -116,7 +116,13 @@ def execute_action(runner, action: str, task_type: str, config: DictConfig) -> N
     elif action == "dryrun":
         runner.run(dryrun=True)
     elif action == "test":
-        runner.run(with_test=True)
+        # Serve tasks are long-running daemons validated externally, so they
+        # must start in the background even during tests.  Other tasks
+        # (train, inference) run in the foreground so output streams directly.
+        if task_type == "serve":
+            runner.run()
+        else:
+            runner.run(background=False)
     elif action == "stop":
         runner.stop()
     elif action == "query":
 
@@ -136,7 +136,7 @@ def _prepare(self):
         logger.info("\n************** LlamaCpp Configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(self, config, host, node_rank, cmd, background=True, with_test=False):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False):
         logging_config = config.logging
 
         no_shared_fs = config.experiment.runner.get("no_shared_fs", False)
@@ -193,15 +193,13 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
             f.write("\n")
             f.write("echo '=========== launch task (LlamaCpp) ==========='\n")
 
-            if with_test:
-                f.write(f'bash -c "$cmd; sync" >> {host_output_file} \n')
+            if background:
+                f.write(
+                    f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                )
             else:
-                if background:
-                    f.write(
-                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                    )
-                else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
 
             f.write("\n")
             f.flush()
 
@@ -36,8 +36,7 @@ def generate_run_script(
         host,
         node_rank,
         cmd,
-        background=True,
-        with_test=False,
+        background=False,
         pkg_dir=None,
         enable_monitoring=False,
     ):
@@ -108,17 +107,13 @@ def generate_run_script(
                 )
             f.write("\n")
 
-            if with_test:
-                f.write('bash -c "$cmd; sync" \n')
+            if background:
+                f.write(
+                    f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                )
             else:
-                # TODO: need a option to control whether to append or overwrite the output file
-                # Now, it always appends to the output file
-                if background:
-                    f.write(
-                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                    )
-                else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
             f.write("\n")
             f.flush()
             os.fsync(f.fileno())
 
@@ -71,7 +71,7 @@ def _prepare(self):
         logger.info("\n************** configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(self, config, host, node_rank, cmd, background=True, with_test=False):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False):
         system_config = config.compress.system
         logging_config = config.compress.system.logging
 
@@ -113,17 +113,13 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
             f.write("\n")
             f.write(f'cmd="{cmd}"\n')
             f.write("\n")
-            if with_test:
-                f.write('bash -c "$cmd; sync" \n')
+            if background:
+                f.write(
+                    f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                )
             else:
-                # TODO: need a option to control whether to append or overwrite the output file
-                # Now, it always appends to the output file
-                if background:
-                    f.write(
-                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                    )
-                else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
             f.write("\n")
             f.flush()
             os.fsync(f.fileno())
 
@@ -147,7 +147,7 @@ def _prepare(self):
         logger.info("\n************** Ray Configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(self, config, host, node_rank, cmd, background=True, with_test=False):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False):
         nodes = config.get("nodes", None)
         logging_config = config.logging
 
@@ -350,7 +350,8 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
                     f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
                 )
             else:
-                f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
 
             f.write("\n")
             f.flush()
 
@@ -33,8 +33,7 @@ def generate_run_script(
         host,
         node_rank,
         cmd,
-        background=True,
-        with_test=False,
+        background=False,
         pkg_dir=None,
         enable_monitoring=False,
     ):
@@ -105,17 +104,13 @@ def generate_run_script(
                 )
             f.write("\n")
 
-            if with_test:
-                f.write('bash -c "$cmd; sync" \n')
+            if background:
+                f.write(
+                    f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                )
             else:
-                # TODO: need a option to control whether to append or overwrite the output file
-                # Now, it always appends to the output file
-                if background:
-                    f.write(
-                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                    )
-                else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
             f.write("\n")
             f.flush()
             os.fsync(f.fileno())
 
@@ -149,7 +149,7 @@ def _prepare(self):
         logger.info("\n************** Sglang Configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(self, config, host, node_rank, cmd, background=True, with_test=False):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False):
         nodes = config.get("nodes", None)
         logging_config = config.logging
 
@@ -385,7 +385,8 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
                     f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
                 )
             else:
-                f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
             f.write("\n")
             f.flush()
             os.fsync(f.fileno())
 
@@ -60,9 +60,7 @@ def _prepare(self):
         logger.info("\n************** configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(
-        self, config, host, node_rank, cmd, background=True, with_test=False, resources=None
-    ):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False, resources=None):
         system_config = config.system
         logging_config = config.system.logging
 
@@ -112,17 +110,13 @@ def generate_run_script(
             f.write("\n")
             f.write(f'cmd="{cmd}"\n')
             f.write("\n")
-            if with_test:
-                f.write(f'bash -c "$cmd; sync"  >> {host_output_file} \n')
+            if background:
+                f.write(
+                    f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                )
             else:
-                # TODO: need a option to control whether to append or overwrite the output file
-                # Now, it always appends to the output file
-                if background:
-                    f.write(
-                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                    )
-                else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                f.write("set -o pipefail\n")
+                f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
             f.write("\n")
             f.flush()
             os.fsync(f.fileno())
 
@@ -367,7 +367,7 @@ def _prepare(self):
         logger.info("\n************** configuration **************")
         logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
 
-    def generate_run_script(self, config, host, node_rank, cmd, background=True, with_test=False):
+    def generate_run_script(self, config, host, node_rank, cmd, background=False):
         if self.task_type == "inference":
             logging_config = config.inference.logging
 
@@ -404,17 +404,13 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
                 f.write("\n")
                 f.write(f'cmd="{cmd}"\n')
                 f.write("\n")
-                if with_test:
-                    f.write(f'bash -c "$cmd; sync"  >> {host_output_file} \n')
+                if background:
+                    f.write(
+                        f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
+                    )
                 else:
-                    # TODO: need a option to control whether to append or overwrite the output file
-                    # Now, it always appends to the output file
-                    if background:
-                        f.write(
-                            f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
-                        )
-                    else:
-                        f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                    f.write("set -o pipefail\n")
+                    f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
                 f.write("\n")
                 f.flush()
                 os.fsync(f.fileno())
@@ -595,7 +591,8 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
                             f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
                         )
                     else:
-                        f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                        f.write("set -o pipefail\n")
+                        f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
                 f.write("\n")
                 f.flush()
                 os.fsync(f.fileno())
@@ -977,7 +974,8 @@ def generate_run_script(self, config, host, node_rank, cmd, background=True, wit
                         f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n'
                     )
                 else:
-                    f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n')
+                    f.write("set -o pipefail\n")
+                    f.write(f'bash -c "$cmd; sync" 2>&1 | tee -a {host_output_file}\n')
                 f.write("\n")
                 f.flush()
                 os.fsync(f.fileno())
 
@@ -29,23 +29,23 @@ def _run_each(
         nnodes,
         node_rank,
         nproc_per_node,
-        with_test=False,
+        background=True,
         dryrun=False,
     ):
         export_cmd = []
         for k, v in self.user_envs.items():
             export_cmd += [f"{k}={v}"]
 
-        cmd = shlex.join(export_cmd + ["python"] + [self.user_script] + self.user_args)
+        cmd = shlex.join([*export_cmd, "python", self.user_script, *self.user_args])
 
         host_run_script_file = self.backend.generate_run_script(
-            self.config, host, node_rank, cmd, background=True, with_test=with_test
+            self.config, host, node_rank, cmd, background=background
         )
 
         run_local_command(f"bash {host_run_script_file}", dryrun)
 
     def run(
-        self, with_test=False, dryrun=False, monitor=False, interval=10, enable_monitoring=None
+        self, background=True, dryrun=False, monitor=False, interval=10, enable_monitoring=None
     ):
         num_visible_devices = None
         visible_devices = self.user_envs.get("CUDA_VISIBLE_DEVICES", None)
@@ -67,7 +67,7 @@ def run(
             1,
             0,
             nproc_per_node,
-            with_test=with_test,
+            background=background,
             dryrun=dryrun,
         )
         self.host = available_addr
@@ -118,7 +118,6 @@ def _generate_query_script(self, host, node_rank):
     def _query_each(self, host, node_rank):
         "Query each node status."
         host_query_script_file = self._generate_query_script(host, node_rank)
-        logging_config = self.config.logging
         result = ""
         try:
             result = run_local_command(f"bash {host_query_script_file}", query=True)