Merge pull request #849 from marrlab/benchmark_slurm_log

smilesun · web-flow · commit 344936b71b76 · 2024-07-11T15:01:24.000+02:00
Benchmark slurm log folder custom
diff --git a/docs/doc_benchmark.md b/docs/doc_benchmark.md
@@ -74,10 +74,10 @@ hyperparameter sampling and pytorch.
 The following script will help to find out which job has failed and the error message, so that you could direct to the
 specific log file
 ```cluster
-bash ./sh_list_error.sh ./zoutput/slurm_logs
+bash ./sh_list_error.sh ./zoutput/benchmarks/[output folder of the specified benchmark in the yaml file]/slurm_logs
 ```
 #### Map between slurm job id and sampled hyperparameter index
-suppose the slurm job id is 14144163, one could the corresponding log file in `./zoutput/slurm_logs` folder via
+suppose the slurm job id is 14144163, one could find the corresponding log file in `./zoutput/[output folder of the specified benchmark in the yaml file]/slurm_logs` folder via
 `find . | grep -i "14144163"`
 
 the results can be
diff --git a/domainlab/exp_protocol/benchmark.smk b/domainlab/exp_protocol/benchmark.smk
@@ -72,6 +72,8 @@ rule parameter_sampling:
         expand("{path}", path=config_path)
     output:
         dest=expand("{output_dir}/hyperparameters.csv", output_dir=config["output_dir"])
+    # resources:
+    #    log_dir="slurm_logs_test"
     params:
         sampling_seed=os.environ["DOMAINLAB_CUDA_HYPERPARAM_SEED"]
     run:
@@ -159,6 +161,8 @@ rule agg_results:
     # put different csv file in a big csv file
     input:
         exp_results=experiment_result_files
+    # resources:
+    #    log_dir="slurm_logs_test"
     output:
         out_file=expand("{output_dir}/results.csv", output_dir=config["output_dir"])
     run:
diff --git a/examples/yaml/slurm/config.yaml b/examples/yaml/slurm/config.yaml
@@ -1,6 +1,6 @@
 # This yaml file has been adapted from https://github.com/jdblischak/smk-simple-slurm
 cluster:
-  mkdir -p zoutput/slurm_logs/{rule} &&
+  mkdir -p $logdir/{rule} &&
   sbatch
     --partition=gpu_p
     --qos=gpu_normal
@@ -10,8 +10,8 @@ cluster:
     -c 2
     --mem=160G
     --job-name=smk-{rule}-{wildcards}
-    --output=zoutput/slurm_logs/{rule}/{rule}-{wildcards}-%j.out
-    --error=zoutput/slurm_logs/{rule}/{rule}-{wildcards}-%j.err
+    --output=$logdir/{rule}/{rule}-{wildcards}-%j.out
+    --error=$logdir/{rule}/{rule}-{wildcards}-%j.err
 default-resources:
   - partition=gpu_p
   - qos=gpu_normal
diff --git a/run_benchmark_slurm.sh b/run_benchmark_slurm.sh
@@ -32,4 +32,7 @@ echo "Number of GPUs: $NUMBER_GPUS"
 echo "Results will be stored in: $results_dir"
 
 # Helmholtz
-snakemake --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile"
+export logdir="${results_dir}/slurm_logs/"
+echo "slurm logs going into ${logdir}"
+# snakemake --config logdir="zoutput/benchmark/logs" does not seem to work
+snakemake --profile "examples/yaml/slurm" --config yaml_file="$CONFIGFILE" --keep-going --keep-incomplete --notemp --cores 3 -s "domainlab/exp_protocol/benchmark.smk" --configfile "$CONFIGFILE" --config output_dir="$results_dir" 2>&1 | tee "$logfile"