From 8d6a356369c154c187d90aef55c6da5158166d3b Mon Sep 17 00:00:00 2001 From: smilesun Date: Fri, 5 Jul 2024 17:06:22 +0200 Subject: [PATCH 1/7] use commandline to specify environment variable --- run_benchmark_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_benchmark_slurm.sh b/run_benchmark_slurm.sh index 72b8bbd83..36aeb7c8d 100755 --- a/run_benchmark_slurm.sh +++ b/run_benchmark_slurm.sh @@ -32,4 +32,4 @@ echo "Number of GPUs: $NUMBER_GPUS" echo "Results will be stored in: $results_dir" # Helmholtz -snakemake --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile" \ No newline at end of file +snakemake --config logdir="zoutput/benchmark/logs" --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile" From 8182f5b4c9d7e4b1eb627062c2ffe5fb1450121e Mon Sep 17 00:00:00 2001 From: smilesun Date: Fri, 5 Jul 2024 17:07:43 +0200 Subject: [PATCH 2/7] {log_dir} --- examples/yaml/slurm/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/yaml/slurm/config.yaml b/examples/yaml/slurm/config.yaml index e180df74d..fb229c922 100644 --- a/examples/yaml/slurm/config.yaml +++ b/examples/yaml/slurm/config.yaml @@ -1,6 +1,6 @@ # This yaml file has been adapted from https://github.com/jdblischak/smk-simple-slurm cluster: - mkdir -p zoutput/slurm_logs/{rule} && + mkdir -p zoutput/{log_dir}/{rule} && sbatch --partition=gpu_p --qos=gpu_normal @@ -10,8 +10,8 @@ cluster: -c 2 --mem=160G --job-name=smk-{rule}-{wildcards} - --output=zoutput/slurm_logs/{rule}/{rule}-{wildcards}-%j.out - --error=zoutput/slurm_logs/{rule}/{rule}-{wildcards}-%j.err +
--output=zoutput/{log_dir}/{rule}/{rule}-{wildcards}-%j.out + --error=zoutput/{log_dir}/{rule}/{rule}-{wildcards}-%j.err default-resources: - partition=gpu_p - qos=gpu_normal From 422aaa513257bf1e2d4f37e011735b84f13c96fc Mon Sep 17 00:00:00 2001 From: smilesun Date: Mon, 8 Jul 2024 15:36:40 +0200 Subject: [PATCH 3/7] define resources in each rule --- domainlab/exp_protocol/benchmark.smk | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/domainlab/exp_protocol/benchmark.smk b/domainlab/exp_protocol/benchmark.smk index fcc9eb4f3..d33f29a59 100644 --- a/domainlab/exp_protocol/benchmark.smk +++ b/domainlab/exp_protocol/benchmark.smk @@ -72,6 +72,8 @@ rule parameter_sampling: expand("{path}", path=config_path) output: dest=expand("{output_dir}/hyperparameters.csv", output_dir=config["output_dir"]) + resources: + log_dir="slurm_logs_test" params: sampling_seed=os.environ["DOMAINLAB_CUDA_HYPERPARAM_SEED"] run: @@ -107,6 +109,8 @@ rule parameter_sampling: rule run_experiment: input: param_file=rules.parameter_sampling.output + resources: + log_dir="slurm_logs_test" output: # snakemake keyword temporary for temporary directory # like f-string in python {index} is generated in the run block as wildcards @@ -159,6 +163,8 @@ rule agg_results: # put different csv file in a big csv file input: exp_results=experiment_result_files + resources: + log_dir="slurm_logs_test" output: out_file=expand("{output_dir}/results.csv", output_dir=config["output_dir"]) run: From 75d3aca73b8951acbbdda061c3f9dd129b451433 Mon Sep 17 00:00:00 2001 From: smilesun Date: Thu, 11 Jul 2024 11:54:08 +0200 Subject: [PATCH 4/7] slurm custom dir works --- examples/yaml/slurm/config.yaml | 6 +++--- run_benchmark_slurm.sh | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/yaml/slurm/config.yaml b/examples/yaml/slurm/config.yaml index fb229c922..6775d8a75 100644 --- a/examples/yaml/slurm/config.yaml +++ b/examples/yaml/slurm/config.yaml @@ -1,6 +1,6 @@ # This yaml file has 
been adapted from https://github.com/jdblischak/smk-simple-slurm cluster: - mkdir -p zoutput/{log_dir}/{rule} && + mkdir -p $logdir/{rule} && sbatch --partition=gpu_p --qos=gpu_normal @@ -10,8 +10,8 @@ cluster: -c 2 --mem=160G --job-name=smk-{rule}-{wildcards} - --output=zoutput/{log_dir}/{rule}/{rule}-{wildcards}-%j.out - --error=zoutput/{log_dir}/{rule}/{rule}-{wildcards}-%j.err + --output=zoutput/$logdir/{rule}/{rule}-{wildcards}-%j.out + --error=zoutput/$logdir/{rule}/{rule}-{wildcards}-%j.err default-resources: - partition=gpu_p - qos=gpu_normal diff --git a/run_benchmark_slurm.sh b/run_benchmark_slurm.sh index 36aeb7c8d..5ad2b94ff 100755 --- a/run_benchmark_slurm.sh +++ b/run_benchmark_slurm.sh @@ -32,4 +32,7 @@ echo "Number of GPUs: $NUMBER_GPUS" echo "Results will be stored in: $results_dir" # Helmholtz -snakemake --config logdir="zoutput/benchmark/logs" --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile" +export logdir="${results_dir}/slurm_logs/" +echo "slurm logs going into ${logdir}" +# snakemake --config logdir="zoutput/benchmark/logs" does not seem to work +snakemake --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile" From eb445cb93cf5c42b532e03b74d5636a293cb343a Mon Sep 17 00:00:00 2001 From: Xudong Sun Date: Thu, 11 Jul 2024 14:44:29 +0200 Subject: [PATCH 5/7] Update benchmark.smk --- domainlab/exp_protocol/benchmark.smk | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/domainlab/exp_protocol/benchmark.smk b/domainlab/exp_protocol/benchmark.smk index d33f29a59..84661ee61 100644 --- a/domainlab/exp_protocol/benchmark.smk +++ 
b/domainlab/exp_protocol/benchmark.smk @@ -72,8 +72,8 @@ rule parameter_sampling: expand("{path}", path=config_path) output: dest=expand("{output_dir}/hyperparameters.csv", output_dir=config["output_dir"]) - resources: - log_dir="slurm_logs_test" + # resources: + # log_dir="slurm_logs_test" params: sampling_seed=os.environ["DOMAINLAB_CUDA_HYPERPARAM_SEED"] run: @@ -109,8 +109,6 @@ rule parameter_sampling: rule run_experiment: input: param_file=rules.parameter_sampling.output - resources: - log_dir="slurm_logs_test" output: # snakemake keyword temporary for temporary directory # like f-string in python {index} is generated in the run block as wildcards @@ -163,8 +161,8 @@ rule agg_results: # put different csv file in a big csv file input: exp_results=experiment_result_files - resources: - log_dir="slurm_logs_test" + # resources: + # log_dir="slurm_logs_test" output: out_file=expand("{output_dir}/results.csv", output_dir=config["output_dir"]) run: From bac7907ca513a450c416bac898165ff3f210c687 Mon Sep 17 00:00:00 2001 From: Xudong Sun Date: Thu, 11 Jul 2024 14:47:16 +0200 Subject: [PATCH 6/7] Update config.yaml --- examples/yaml/slurm/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/yaml/slurm/config.yaml b/examples/yaml/slurm/config.yaml index 6775d8a75..642367adc 100644 --- a/examples/yaml/slurm/config.yaml +++ b/examples/yaml/slurm/config.yaml @@ -10,8 +10,8 @@ cluster: -c 2 --mem=160G --job-name=smk-{rule}-{wildcards} - --output=zoutput/$logdir/{rule}/{rule}-{wildcards}-%j.out - --error=zoutput/$logdir/{rule}/{rule}-{wildcards}-%j.err + --output=$logdir/{rule}/{rule}-{wildcards}-%j.out + --error=$logdir/{rule}/{rule}-{wildcards}-%j.err default-resources: - partition=gpu_p - qos=gpu_normal From d72a8e45b205eefcd01468d7160244e04d7211c4 Mon Sep 17 00:00:00 2001 From: smilesun Date: Thu, 11 Jul 2024 14:58:44 +0200 Subject: [PATCH 7/7] update doc --- docs/doc_benchmark.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/docs/doc_benchmark.md b/docs/doc_benchmark.md index 41b19bffb..ec73fb36e 100644 --- a/docs/doc_benchmark.md +++ b/docs/doc_benchmark.md @@ -74,10 +74,10 @@ hyperparameter sampling and pytorch. The following script will help to find out which job has failed and the error message, so that you could direct to the specific log file ```cluster -bash ./sh_list_error.sh ./zoutput/slurm_logs +bash ./sh_list_error.sh ./zoutput/benchmarks/[output folder of the specified benchmark in the yaml file]/slurm_logs ``` #### Map between slurm job id and sampled hyperparameter index -suppose the slurm job id is 14144163, one could the corresponding log file in `./zoutput/slurm_logs` folder via +suppose the slurm job id is 14144163, one could find the corresponding log file in `./zoutput/[output folder of the specified benchmark in the yaml file]/slurm_logs` folder via `find . | grep -i "14144163"` the results can be