Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability for clients to send error log to server #3057

Merged
merged 11 commits into from
Nov 21, 2024
2 changes: 1 addition & 1 deletion docs/resources/log.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ args=(sys.stdout,)
class=FileHandler
level=ERROR
formatter=fullFormatter
args=('error.log', 'a')
args=('error_log.txt', 'a')

[formatter_fullFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
1 change: 1 addition & 0 deletions nvflare/apis/fl_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ class WorkspaceConstants:
DEFAULT_LOGGING_CONFIG = LOGGING_CONFIG + ".default"
AUDIT_LOG = "audit.log"
LOG_FILE_NAME = "log.txt"
ERROR_LOG_FILE_NAME = "error_log.txt"
STATS_POOL_SUMMARY_FILE_NAME = "stats_pool_summary.json"
STATS_POOL_RECORDS_FILE_NAME = "stats_pool_records.csv"

Expand Down
3 changes: 3 additions & 0 deletions nvflare/apis/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def get_app_dir(self, job_id: str) -> str:
def get_app_log_file_path(self, job_id: str) -> str:
return os.path.join(self.get_run_dir(job_id), WorkspaceConstants.LOG_FILE_NAME)

def get_app_error_log_file_path(self, job_id: str) -> str:
return os.path.join(self.get_run_dir(job_id), WorkspaceConstants.ERROR_LOG_FILE_NAME)

def get_app_config_dir(self, job_id: str) -> str:
return os.path.join(self.get_app_dir(job_id), self.config_folder)

Expand Down
2 changes: 1 addition & 1 deletion nvflare/private/fed/app/simulator/log.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ args=(sys.stdout,)
class=FileHandler
level=ERROR
formatter=fullFormatter
args=('error.log', 'a')
args=('error_log.txt', 'a')

[formatter_fullFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
12 changes: 10 additions & 2 deletions nvflare/private/fed/client/client_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def start_app(

thread = threading.Thread(
target=self._wait_child_process_finish,
args=(client, job_id, allocated_resource, token, resource_manager, args.workspace),
args=(client, job_id, allocated_resource, token, resource_manager, args.workspace, fl_ctx),
)
thread.start()

Expand Down Expand Up @@ -384,7 +384,9 @@ def abort_task(self, job_id):
)
self.logger.debug("abort_task sent")

def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace):
def _wait_child_process_finish(
self, client, job_id, allocated_resource, token, resource_manager, workspace, fl_ctx
):
self.logger.info(f"run ({job_id}): waiting for child worker process to finish.")
job_handle = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_HANDLE)
if job_handle:
Expand All @@ -393,6 +395,7 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
return_code = get_return_code(job_handle, job_id, workspace, self.logger)

self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}")

if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]:
request = new_cell_message(
headers={},
Expand All @@ -419,6 +422,11 @@ def _wait_child_process_finish(self, client, job_id, allocated_resource, token,
self.run_processes.pop(job_id, None)
self.logger.debug(f"run ({job_id}): child worker resources freed.")

engine = fl_ctx.get_engine()
fl_ctx.set_prop(FLContextKey.CURRENT_JOB_ID, job_id, private=True, sticky=False)
fl_ctx.set_prop(FLContextKey.CLIENT_NAME, client.client_name, private=True, sticky=False)
engine.fire_event(EventType.JOB_COMPLETED, fl_ctx)

def get_status(self, job_id):
process_status = self.run_processes.get(job_id, {}).get(RunProcessKey.STATUS, ClientStatus.STOPPED)
return process_status
Expand Down
4 changes: 2 additions & 2 deletions nvflare/private/fed/utils/fed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def add_logfile_handler(log_file: str):
The purpose for this is to handle dynamic log file locations.
If a handler named errorFileHandler is found, it will be used as a template to
create a new handler for writing to the error.log file at the same directory as log_file.
create a new handler for writing to the error log file at the same directory as log_file.
The original errorFileHandler will be removed and replaced by the new handler.
Each log file will be rotated when it reaches 20MB.
Expand All @@ -90,7 +90,7 @@ def add_logfile_handler(log_file: str):
if not configured_error_handler:
return

error_log_file = os.path.join(os.path.dirname(log_file), "error.log")
error_log_file = os.path.join(os.path.dirname(log_file), WorkspaceConstants.ERROR_LOG_FILE_NAME)
error_file_handler = RotatingFileHandler(error_log_file, maxBytes=20 * 1024 * 1024, backupCount=10)
error_file_handler.setLevel(configured_error_handler.level)
error_file_handler.setFormatter(configured_error_handler.formatter)
Expand Down
Loading