Skip to content

Commit

Permalink
Add ability for clients to send error log to server (#3057)
Browse files Browse the repository at this point in the history
* have client send error log to server

* rename file and fix ci

* fix ci

* revise PR

* fix formatting

---------

Co-authored-by: Yuan-Ting Hsieh (謝沅廷) <[email protected]>
  • Loading branch information
nvkevlu and YuanTingHsieh authored Nov 21, 2024
1 parent 1a17390 commit 82fe663
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/resources/log.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ args=(sys.stdout,)
class=FileHandler
level=ERROR
formatter=fullFormatter
args=('error.log', 'a')
args=('error_log.txt', 'a')

[formatter_fullFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
1 change: 1 addition & 0 deletions nvflare/apis/fl_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ class WorkspaceConstants:
DEFAULT_LOGGING_CONFIG = LOGGING_CONFIG + ".default"
AUDIT_LOG = "audit.log"
LOG_FILE_NAME = "log.txt"
ERROR_LOG_FILE_NAME = "error_log.txt"
STATS_POOL_SUMMARY_FILE_NAME = "stats_pool_summary.json"
STATS_POOL_RECORDS_FILE_NAME = "stats_pool_records.csv"

Expand Down
3 changes: 3 additions & 0 deletions nvflare/apis/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def get_app_dir(self, job_id: str) -> str:
def get_app_log_file_path(self, job_id: str) -> str:
return os.path.join(self.get_run_dir(job_id), WorkspaceConstants.LOG_FILE_NAME)

def get_app_error_log_file_path(self, job_id: str) -> str:
return os.path.join(self.get_run_dir(job_id), WorkspaceConstants.ERROR_LOG_FILE_NAME)

def get_app_config_dir(self, job_id: str) -> str:
return os.path.join(self.get_app_dir(job_id), self.config_folder)

Expand Down
2 changes: 1 addition & 1 deletion nvflare/private/fed/app/simulator/log.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ args=(sys.stdout,)
class=FileHandler
level=ERROR
formatter=fullFormatter
args=('error.log', 'a')
args=('error_log.txt', 'a')

[formatter_fullFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
12 changes: 10 additions & 2 deletions nvflare/private/fed/client/client_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def start_app(

thread = threading.Thread(
target=self._wait_child_process_finish,
args=(client, job_id, allocated_resource, token, resource_manager, args.workspace),
args=(client, job_id, allocated_resource, token, resource_manager, args.workspace, fl_ctx),
)
thread.start()

Expand Down Expand Up @@ -384,7 +384,9 @@ def abort_task(self, job_id):
)
self.logger.debug("abort_task sent")

def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace):
def _wait_child_process_finish(
self, client, job_id, allocated_resource, token, resource_manager, workspace, fl_ctx
):
self.logger.info(f"run ({job_id}): waiting for child worker process to finish.")
job_handle = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_HANDLE)
if job_handle:
Expand All @@ -393,6 +395,7 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
return_code = get_return_code(job_handle, job_id, workspace, self.logger)

self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}")

if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]:
request = new_cell_message(
headers={},
Expand All @@ -419,6 +422,11 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
self.run_processes.pop(job_id, None)
self.logger.debug(f"run ({job_id}): child worker resources freed.")

engine = fl_ctx.get_engine()
fl_ctx.set_prop(FLContextKey.CURRENT_JOB_ID, job_id, private=True, sticky=False)
fl_ctx.set_prop(FLContextKey.CLIENT_NAME, client.client_name, private=True, sticky=False)
engine.fire_event(EventType.JOB_COMPLETED, fl_ctx)

def get_status(self, job_id):
process_status = self.run_processes.get(job_id, {}).get(RunProcessKey.STATUS, ClientStatus.STOPPED)
return process_status
Expand Down
4 changes: 2 additions & 2 deletions nvflare/private/fed/utils/fed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def add_logfile_handler(log_file: str):
The purpose for this is to handle dynamic log file locations.
If a handler named errorFileHandler is found, it will be used as a template to
create a new handler for writing to the error.log file at the same directory as log_file.
create a new handler for writing to the error log file at the same directory as log_file.
The original errorFileHandler will be removed and replaced by the new handler.
Each log file will be rotated when it reaches 20MB.
Expand All @@ -90,7 +90,7 @@ def add_logfile_handler(log_file: str):
if not configured_error_handler:
return

error_log_file = os.path.join(os.path.dirname(log_file), "error.log")
error_log_file = os.path.join(os.path.dirname(log_file), WorkspaceConstants.ERROR_LOG_FILE_NAME)
error_file_handler = RotatingFileHandler(error_log_file, maxBytes=20 * 1024 * 1024, backupCount=10)
error_file_handler.setLevel(configured_error_handler.level)
error_file_handler.setFormatter(configured_error_handler.formatter)
Expand Down

0 comments on commit 82fe663

Please sign in to comment.