Skip to content

Commit bd031fe

Browse files
committed
handle CP2K out-of-memory using ulimit
1 parent 382f744 commit bd031fe

File tree

7 files changed

+46
-20
lines changed

7 files changed

+46
-20
lines changed

configs/hortense.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ ModelTraining:
3030
CP2K:
3131
cores_per_worker: 64
3232
max_evaluation_time: 30
33-
launch_command: 'apptainer exec --memory 50000000000 -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 -bind-to core cp2k.psmp -i cp2k.inp'
33+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 -bind-to core cp2k.psmp'
3434
slurm:
3535
partition: "cpu_rome"
3636
account: "2024_079"

configs/lumi.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ default_threads: 8
66
CP2K:
77
cores_per_worker: 32
88
max_evaluation_time: 20
9-
launch_command: 'singularity exec --memory 50000000000 -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 cp2k.psmp -i cp2k.inp'
9+
launch_command: 'singularity exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -np 32 cp2k.psmp'
1010
slurm:
1111
partition: "standard"
1212
account: "project_465001125"

configs/threadpool.yaml

+5-4
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,18 @@ ModelTraining:
1313
CP2K:
1414
cores_per_worker: 2
1515
max_evaluation_time: 0.3
16-
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp -i cp2k.inp'
16+
memory_limit: 2GB
17+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp'
1718
CP2K_container:
1819
cores_per_worker: 2
1920
max_evaluation_time: 0.3
20-
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp -i cp2k.inp'
21+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2024.1 /opt/entry.sh mpirun -bind-to core -np 2 -env OMP_NUM_THREADS 1 cp2k.psmp'
2122
GPAW:
2223
cores_per_worker: 2
2324
max_evaluation_time: 0.3
24-
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py input.json'
25+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py'
2526
GPAW_container:
2627
cores_per_worker: 2
2728
max_evaluation_time: 0.3
28-
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py input.json'
29+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/gpaw:24.1 /opt/entry.sh mpirun -np 2 gpaw python /opt/run_gpaw.py'
2930
...

configs/wq.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,5 @@ ModelTraining:
1313
CP2K:
1414
cores_per_worker: 2
1515
max_evaluation_time: 0.3
16-
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 2 -x OMP_NUM_THREADS=1 cp2k.psmp -i cp2k.inp'
16+
launch_command: 'apptainer exec -e --no-init oras://ghcr.io/molmod/cp2k:2023.2 /opt/entry.sh mpirun -np 2 -x OMP_NUM_THREADS=1 cp2k.psmp'
1717
...

psiflow/execution.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations # necessary for type-guarding class methods
22

3+
import re
4+
35
import logging
46
import math
57
import shutil
@@ -301,18 +303,22 @@ def __init__(
301303
name: str,
302304
launch_command: Optional[str] = None,
303305
max_evaluation_time: Optional[float] = None,
306+
memory_limit: Optional[str] = None,
304307
**kwargs,
305308
) -> None:
306309
super().__init__(**kwargs)
307310
self.name = name # override name
311+
312+
if launch_command is None:
313+
launch_command = self.default_launch_command
314+
self.launch_command = launch_command
315+
308316
if max_evaluation_time is None:
309317
max_evaluation_time = self.max_runtime / 60
310318
assert max_evaluation_time * 60 <= self.max_runtime
311319
self.max_evaluation_time = max_evaluation_time
312320

313-
if launch_command is None:
314-
launch_command = self.default_launch_command
315-
self.launch_command = launch_command
321+
self.memory_limit = memory_limit
316322

317323
@property
318324
def default_launch_command(self):
@@ -334,14 +340,32 @@ def default_launch_command(self):
334340
raise ValueError('provide path to ORCA executable via "launch_command"')
335341

336342
def command(self):
    """Build the shell snippet that runs one reference evaluation.

    The snippet is a brace group of the form
    ``{ [ulimit -v <KiB>;] timeout -s 9 <t>s <launch command>; exit 0; }``:

    * the launch command gets the fixed input file argument appended for
      CP2K (``-i cp2k.inp``) and GPAW (``input.json``) definitions;
    * the run is killed (signal 9) after 90% of ``max_evaluation_time``
      (which is expressed in minutes);
    * when ``memory_limit`` is set (e.g. ``"2GB"``, ``"500 MB"``), the
      virtual memory of the group is capped with ``ulimit -v``;
    * the trailing ``exit 0`` makes the snippet finish with status 0 even
      when the calculation was killed or crashed.

    Returns:
        str: the shell command string.

    Raises:
        ValueError: if ``memory_limit`` is not ``<number>[K|M|G|T]B``.
    """
    launch_command = self.launch_command
    if self.name.startswith("CP2K"):
        launch_command += " -i cp2k.inp"  # add input file
    elif self.name.startswith("GPAW"):
        launch_command += " input.json"
    if self.max_evaluation_time is not None:
        max_time = 0.9 * (60 * self.max_evaluation_time)
        launch_command = "timeout -s 9 {}s {}".format(max_time, launch_command)

    extra_commands = []
    if self.memory_limit is not None:
        # size parsing based on https://stackoverflow.com/a/42865957/2002471
        units = {"B": 1, "KB": 2**10, "MB": 2**20, "GB": 2**30, "TB": 2**40}
        text = str(self.memory_limit).strip().upper()
        parsed = re.fullmatch(r"(.+?)\s*([KMGT]?B)", text)
        if parsed is None:
            raise ValueError(
                "cannot parse memory_limit {!r}; expected e.g. '2GB'".format(
                    self.memory_limit
                )
            )
        number, unit = parsed.groups()
        n_bytes = int(float(number) * units[unit])
        # bash's `ulimit -v` counts in 1024-byte units, not bytes; passing
        # the byte count would make the limit 1024 times too large.
        # Round up so that tiny limits never collapse to 0.
        n_kib = max(1, -(-n_bytes // 1024))
        extra_commands.append("ulimit -v {}".format(n_kib))

    body = "; ".join(extra_commands + [launch_command, "exit 0"]) + "; "
    return " { " + body + " } "
346370

347371
def wq_resources(self):

psiflow/reference/_cp2k.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ def get_single_atom_references(self, element):
260260
"ot": {"minimizer": "CG"}
261261
}
262262
# necessary for oxygen calculation, at least in 2024.1
263-
cp2k_input_dict['force_eval']['dft']['scf']['ignore_convergence_failure'] = True
263+
key = 'ignore_convergence_failure'
264+
cp2k_input_dict['force_eval']['dft']['scf'][key] = "TRUE"
264265

265266
reference = CP2K(
266267
dict_to_str(cp2k_input_dict),

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ dependencies = [
1616
"parsl @ git+https://github.com/parsl/parsl.git@benc-status-refactor",
1717
"prettytable",
1818
"psutil",
19-
"cp2k-input-tools",
19+
"cp2k-input-tools @ git+https://github.com/cp2k/cp2k-input-tools", # need 2024.1
2020
"pytimeparse",
2121
]
2222

0 commit comments

Comments
 (0)