Skip to content

Commit 7062fad

Browse files
authored
Merge pull request #276 from zillow/tz/AIP-7487-join-memory
AIP-7487 join memory fix w/ METAFLOW_S3OP_NUM_WORKERS=5
2 parents b866f3c + fc3565e commit 7062fad

File tree

4 files changed

+14
-1
lines changed

4 files changed

+14
-1
lines changed

metaflow/datatools/s3op.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from metaflow.multicore_utils import parallel_map
3636
from metaflow.datatools.s3util import aws_retry, read_in_chunks, get_timestamp
3737

38-
NUM_WORKERS_DEFAULT = 64
38+
NUM_WORKERS_DEFAULT = int(os.environ.get("METAFLOW_S3OP_NUM_WORKERS", 64))
3939

4040
DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
4141
DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1

metaflow/plugins/aip/aip.py

+2
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,8 @@ def _create_metaflow_step_op(
14881488
metaflow_execution_cmd += f" --namespace {flow_variables.namespace}"
14891489
if step_variables.is_split_index:
14901490
metaflow_execution_cmd += " --is_split_index"
1491+
if node.type == "join":
1492+
metaflow_execution_cmd += " --is-join-step"
14911493

14921494
metaflow_execution_cmd += ' --preceding_component_outputs_dict "'
14931495
for key in preceding_component_outputs_dict:

metaflow/plugins/aip/aip_constants.py

+2
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@
2828
STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
2929

3030
AIP_CLI_DEFAULT_RETRY = 3
31+
32+
AIP_JOIN_METAFLOW_S3OP_NUM_WORKERS = 5

metaflow/plugins/aip/aip_metaflow_step.py

+9
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
STDOUT_PATH,
2323
STEP_ENVIRONMENT_VARIABLES,
2424
TASK_ID_ENV_NAME,
25+
AIP_JOIN_METAFLOW_S3OP_NUM_WORKERS,
2526
)
2627

2728
from ... import R
@@ -282,6 +283,7 @@ def _command(
282283
@click.option("--user_code_retries", type=int)
283284
@click.option("--workflow_name")
284285
@click.option("--is-interruptible/--not-interruptible", default=False)
286+
@click.option("--is-join-step", is_flag=True, default=False)
285287
def aip_metaflow_step(
286288
volume_dir: str,
287289
environment: str,
@@ -306,6 +308,7 @@ def aip_metaflow_step(
306308
user_code_retries: int,
307309
workflow_name: str,
308310
is_interruptible: bool,
311+
is_join_step: bool,
309312
) -> None:
310313
"""
311314
(1) Renders and runs the Metaflow package_commands and Metaflow step
@@ -373,6 +376,12 @@ def aip_metaflow_step(
373376
):
374377
metaflow_configs_new["METAFLOW_USER"] = "aip-user"
375378

379+
if is_join_step and "METAFLOW_S3OP_NUM_WORKERS" not in os.environ:
380+
# AIP-7487: Metaflow join steps require lots of memory, so default to fewer S3 op workers unless METAFLOW_S3OP_NUM_WORKERS is already set
381+
os.environ["METAFLOW_S3OP_NUM_WORKERS"] = str(
382+
AIP_JOIN_METAFLOW_S3OP_NUM_WORKERS
383+
)
384+
376385
env: Dict[str, str] = {
377386
**os.environ,
378387
**metaflow_configs_new,

0 commit comments

Comments
 (0)