Skip to content

Commit ee31aae

Browse files
committed
Release 0.14.0
* Fix several bugs * Improve device cleanup to be fully parallel * Improve db connection timeout error * Update dependencies
1 parent 9f3aa2b commit ee31aae

11 files changed

Lines changed: 291 additions & 238 deletions

File tree

eos/campaigns/campaign_executor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ async def start_campaign(self, db: AsyncDbSession) -> None:
6969
await self._initialize_new_campaign(db)
7070

7171
await self._campaign_manager.start_campaign(db, self._campaign_name)
72-
await db.flush()
72+
await db.commit()
7373

7474
self._campaign_status = CampaignStatus.RUNNING
7575
log.info(f"Started campaign '{self._campaign_name}'")

eos/configuration/validation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _validate_device_types(self) -> None:
105105
for device_name, device in self._lab.devices.items():
106106
if not self._device_specs.get_spec_by_config(device):
107107
batch_error(
108-
f"Device type '{device.name}' of device '{device_name}' does not exist.",
108+
f"Device type '{device.type}' of device '{device_name}' does not exist.",
109109
EosLabConfigurationError,
110110
)
111111
raise_batched_errors(EosLabConfigurationError)
@@ -120,7 +120,7 @@ def _validate_device_init_parameters(self) -> None:
120120
if param_name not in spec_params:
121121
batch_error(
122122
f"Invalid initialization parameter '{param_name}' for device '{device_name}' "
123-
f"of type '{device.name}' in lab type '{self._lab.name}'. "
123+
f"of type '{device.type}' in lab type '{self._lab.name}'. "
124124
f"Valid parameters are: {', '.join(spec_params.keys())}",
125125
EosLabConfigurationError,
126126
)

eos/database/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
11
class EosFileDbError(Exception):
22
pass
3+
4+
5+
class DbConnectionError(Exception):
6+
"""Raised when database connection fails (e.g., timeout, connection refused)."""

eos/database/postgresql_db_interface.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
AsyncEngine,
77
create_async_engine,
88
)
9-
from eos.logging.logger import log
9+
1010
from eos.database.abstract_sql_db_interface import AbstractSqlDbInterface, Base
1111
from eos.database.alembic_commands import alembic_upgrade, alembic_downgrade
12+
from eos.database.exceptions import DbConnectionError
13+
from eos.logging.logger import log
1214

1315

1416
class PostgresqlDbInterface(AbstractSqlDbInterface):
@@ -72,9 +74,12 @@ def _create_system_engine(self) -> AsyncEngine:
7274
async def initialize_database(self) -> None:
7375
"""Initialize database by creating it if needed and running migrations.
7476
75-
:raises Exception: If initialization fails
77+
:raises DbConnectionError: If connection to database fails
78+
:raises Exception: If initialization fails for other reasons
7679
"""
77-
log.info(f"Connecting to database at {self._db_config.postgres.host}:{self._db_config.postgres.port}...")
80+
host = self._db_config.postgres.host
81+
port = self._db_config.postgres.port
82+
log.info(f"Connecting to database at {host}:{port}...")
7883
try:
7984
exists = await self._database_exists()
8085
if not exists:
@@ -87,6 +92,15 @@ async def initialize_database(self) -> None:
8792
await conn.run_sync(Base.metadata.create_all)
8893

8994
log.info(f"Connected to database '{self._db_name}'")
95+
except TimeoutError as e:
96+
raise DbConnectionError(
97+
f"Connection to database at {host}:{port} timed out. "
98+
"Please check that PostgreSQL is running and accessible."
99+
) from e
100+
except OSError as e:
101+
raise DbConnectionError(
102+
f"Failed to connect to database at {host}:{port}. Please check that PostgreSQL is running. Error: {e}"
103+
) from e
90104
except Exception as e:
91105
log.error(f"Failed to initialize database: {e!s}")
92106
raise

eos/devices/device_manager.py

Lines changed: 86 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from eos.configuration.configuration_manager import ConfigurationManager
1010
from eos.configuration.constants import EOS_COMPUTER_NAME
11+
from eos.allocation.entities.device_allocation import DeviceAllocationModel
1112
from eos.devices.entities.device import Device, DeviceStatus, DeviceModel
1213
from eos.devices.exceptions import EosDeviceStateError, EosDeviceInitializationError
1314
from eos.logging.batch_error_logger import batch_error, raise_batched_errors
@@ -125,15 +126,14 @@ async def reload_devices(self, db: AsyncDbSession, lab_name: str, device_names:
125126
raise
126127

127128
# Cleanup the specific device actors
128-
reload_tasks = []
129-
130-
for device_name in device_names:
131-
actor_name = f"{lab_name}.{device_name}"
132-
if actor_name in self._device_actor_handles:
133-
reload_tasks.append(self._cleanup_single_device(actor_name))
129+
actors_to_cleanup = [
130+
f"{lab_name}.{device_name}"
131+
for device_name in device_names
132+
if f"{lab_name}.{device_name}" in self._device_actor_handles
133+
]
134134

135-
if reload_tasks:
136-
await asyncio.gather(*reload_tasks)
135+
if actors_to_cleanup:
136+
await self._cleanup_device_actors_with_timeout(actors_to_cleanup)
137137

138138
# Remove device records from database
139139
await db.execute(
@@ -183,69 +183,96 @@ async def cleanup_device_actors(self, db: AsyncDbSession, lab_names: list[str] |
183183
if not actor_names:
184184
return
185185

186-
cleanup_tasks = [
187-
self._cleanup_single_device(actor_name)
188-
for actor_name in actor_names
189-
if actor_name in self._device_actor_handles
190-
]
191-
192-
if cleanup_tasks:
193-
await asyncio.gather(*cleanup_tasks)
186+
actors_to_cleanup = [name for name in actor_names if name in self._device_actor_handles]
187+
if actors_to_cleanup:
188+
await self._cleanup_device_actors_with_timeout(actors_to_cleanup)
194189

195190
await self.cleanup_devices(db, lab_names)
196191

197-
async def _get_actor_names_to_cleanup(self, db: AsyncDbSession, lab_names: list[str] | None) -> list[str]:
198-
"""Get actor names that need to be cleaned up."""
199-
if not lab_names:
200-
return list(self._device_actor_handles.keys())
192+
async def _cleanup_device_actors_with_timeout(self, actor_names: list[str], cleanup_timeout: float = 30.0) -> None:
193+
"""Clean up multiple device actors concurrently with a timeout."""
194+
# Start cleanup on all actors and collect the object refs
195+
cleanup_refs: dict[ray.ObjectRef, str] = {}
196+
for actor_name in actor_names:
197+
actor_handle = self._device_actor_handles.get(actor_name)
198+
if actor_handle is None:
199+
continue
201200

202-
result = await db.execute(select(DeviceModel).where(DeviceModel.lab_name.in_(lab_names)))
203-
devices = [Device.model_validate(device) for device in result.scalars()]
204-
return [device.get_actor_name() for device in devices]
201+
try:
202+
log.info(f"Cleaning up device '{actor_name}'...")
203+
cleanup_ref = actor_handle.cleanup.remote()
204+
cleanup_refs[cleanup_ref] = actor_name
205+
except Exception as e:
206+
log.error(f"Failed to start cleanup for device '{actor_name}': {e}")
207+
self._forcefully_kill_actor(actor_name)
205208

206-
async def _cleanup_single_device(self, actor_name: str) -> None:
207-
"""Clean up a single device actor with timeout.
209+
if not cleanup_refs:
210+
return
208211

209-
Attempts to gracefully clean up a device actor. If the cleanup
210-
doesn't complete within 30 seconds, forcefully kills the actor.
212+
# Process cleanups as they complete, with overall timeout
213+
pending_refs = set(cleanup_refs.keys())
214+
start_time = asyncio.get_event_loop().time()
211215

212-
:param actor_name: The name of the actor to clean up
213-
"""
214-
if actor_name not in self._device_actor_handles:
215-
return
216+
while pending_refs:
217+
elapsed = asyncio.get_event_loop().time() - start_time
218+
remaining_timeout = max(0, cleanup_timeout - elapsed)
216219

217-
actor_handle = self._device_actor_handles[actor_name]
218-
success = False
219-
cleanup_timeout = 30.0
220+
if remaining_timeout <= 0:
221+
break
220222

221-
try:
222-
log.info(f"Cleaning up device actor '{actor_name}'...")
223-
cleanup_ref = actor_handle.cleanup.remote()
223+
ready_refs, _ = ray.wait(
224+
list(pending_refs),
225+
num_returns=1,
226+
timeout=remaining_timeout,
227+
)
224228

225-
# Wait for cleanup to complete with timeout
226-
ready_refs, _ = ray.wait([cleanup_ref], timeout=cleanup_timeout)
229+
if not ready_refs:
230+
# Timeout reached with no more completions
231+
break
227232

228-
if cleanup_ref in ready_refs:
229-
log.info(f"Cleaned up device actor '{actor_name}'")
230-
success = True
231-
else:
232-
log.warning(
233-
f"Timed out cleaning up device actor '{actor_name}' after {cleanup_timeout} seconds, "
234-
f"will forcefully kill..."
235-
)
236-
except Exception as e:
237-
log.error(f"Failed cleaning up device actor '{actor_name}': {e}")
238-
finally:
239-
# Kill if cleanup wasn't successful
240-
if not success and actor_name in self._device_actor_handles:
233+
for ref in ready_refs:
234+
pending_refs.discard(ref)
235+
actor_name = cleanup_refs[ref]
241236
try:
242-
log.warning(f"Forcefully killing device actor '{actor_name}'")
243-
ray.kill(self._device_actor_handles[actor_name])
237+
ray.get(ref) # Check for exceptions
238+
log.info(f"Cleaned up device '{actor_name}'")
244239
except Exception as e:
245-
log.error(f"Error killing device actor '{actor_name}': {e}")
240+
log.error(f"Cleanup failed for device '{actor_name}': {e}")
241+
self._forcefully_kill_actor(actor_name)
242+
finally:
243+
self._remove_device_references(actor_name)
244+
245+
# Forcefully kill actors that timed out
246+
if pending_refs:
247+
timed_out_actors = [cleanup_refs[ref] for ref in pending_refs]
248+
log.warning(
249+
f"Timed out cleaning up {len(timed_out_actors)} device(s) after {cleanup_timeout} seconds: "
250+
f"{', '.join(timed_out_actors)}"
251+
)
252+
for ref in pending_refs:
253+
actor_name = cleanup_refs[ref]
254+
self._forcefully_kill_actor(actor_name)
255+
self._remove_device_references(actor_name)
256+
257+
def _forcefully_kill_actor(self, actor_name: str) -> None:
258+
"""Forcefully kill a device actor."""
259+
if actor_name not in self._device_actor_handles:
260+
return
261+
262+
try:
263+
log.warning(f"Forcefully killing device '{actor_name}'")
264+
ray.kill(self._device_actor_handles[actor_name])
265+
except Exception as e:
266+
log.error(f"Error killing device '{actor_name}': {e}")
267+
268+
async def _get_actor_names_to_cleanup(self, db: AsyncDbSession, lab_names: list[str] | None) -> list[str]:
269+
"""Get actor names that need to be cleaned up."""
270+
if not lab_names:
271+
return list(self._device_actor_handles.keys())
246272

247-
# Clean up references regardless of success
248-
self._remove_device_references(actor_name)
273+
result = await db.execute(select(DeviceModel).where(DeviceModel.lab_name.in_(lab_names)))
274+
devices = [Device.model_validate(device) for device in result.scalars()]
275+
return [device.get_actor_name() for device in devices]
249276

250277
def _remove_device_references(self, actor_name: str) -> None:
251278
"""Remove device references from internal tracking dictionaries."""
@@ -255,9 +282,11 @@ def _remove_device_references(self, actor_name: str) -> None:
255282
async def cleanup_devices(self, db: AsyncDbSession, lab_names: list[str] | None = None) -> None:
256283
"""Remove device records from the database."""
257284
if lab_names:
285+
await db.execute(delete(DeviceAllocationModel).where(DeviceAllocationModel.lab_name.in_(lab_names)))
258286
await db.execute(delete(DeviceModel).where(DeviceModel.lab_name.in_(lab_names)))
259287
log.debug(f"Cleaned up devices for lab(s): {', '.join(lab_names)}")
260288
else:
289+
await db.execute(delete(DeviceAllocationModel))
261290
await db.execute(delete(DeviceModel))
262291
log.debug("Cleaned up all devices")
263292

eos/experiments/experiment_executor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ async def start_experiment(self, db: AsyncDbSession) -> None:
7171
)
7272

7373
await self._experiment_manager.start_experiment(db, self._experiment_name)
74+
await db.commit()
7475
self._experiment_status = ExperimentStatus.RUNNING
7576

7677
action = "Resumed" if self._experiment_submission.resume else "Started"

eos/orchestration/orchestrator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ async def initialize(self) -> None:
111111
di.register(WorkSignal, self._work_signal)
112112

113113
# State management ########################################
114+
allocation_manager = AllocationManager()
115+
async with db_interface.get_async_session() as db:
116+
await allocation_manager.initialize(db)
117+
di.register(AllocationManager, allocation_manager)
118+
114119
device_manager = DeviceManager()
115120
async with db_interface.get_async_session() as db:
116121
await device_manager.cleanup_devices(db)
@@ -121,11 +126,6 @@ async def initialize(self) -> None:
121126
await resource_manager.initialize(db)
122127
di.register(ResourceManager, resource_manager)
123128

124-
allocation_manager = AllocationManager()
125-
async with db_interface.get_async_session() as db:
126-
await allocation_manager.initialize(db)
127-
di.register(AllocationManager, allocation_manager)
128-
129129
task_manager = TaskManager()
130130
di.register(TaskManager, task_manager)
131131

eos/orchestration/services/lab_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ async def get_lab_devices(
4545
filtered_devices = {
4646
name: device
4747
for name, device in lab.devices.items()
48-
if not task_device_types or device.name in task_device_types
48+
if not task_device_types or device.type in task_device_types
4949
}
5050

5151
if filtered_devices:

eos/orchestration/services/loading_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ async def reload_labs(self, db: AsyncDbSession, lab_types: set[str]) -> None:
5555
"""Reload one or more labs in the orchestrator with updated device plugin code."""
5656
for lab_type in lab_types:
5757
lab = self._configuration_manager.package_manager.read_lab(lab_type)
58-
device_types = {device.name for device in lab.devices.values()}
58+
device_types = {device.type for device in lab.devices.values()}
5959
for device_type in device_types:
6060
try:
6161
self._configuration_manager.devices.reload_plugin(device_type)

pyproject.toml

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "eos"
3-
version = "0.13.0"
3+
version = "0.14.0"
44
description = "EOS is a comprehensive software framework and runtime for laboratory automation."
55
keywords = ["automation", "science", "lab", "experiment", "orchestration", "distributed", "infrastructure"]
66
authors = [
@@ -22,39 +22,39 @@ classifiers = [
2222
readme = "README.md"
2323
requires-python = ">=3.11"
2424
dependencies = [
25-
"ray[default]==2.52.1",
26-
"typer==0.20.0",
27-
"rich==14.2.0",
28-
"PyYAML==6.0.3",
29-
"jinja2==3.1.6",
30-
"pydantic==2.12.5",
31-
"pydantic-settings==2.12.0",
32-
"networkx==3.6.0",
33-
"sqlalchemy==2.0.44",
34-
"alembic==1.17.2",
35-
"psycopg[binary]==3.3.1",
36-
"asyncpg==0.31.0",
37-
"minio==7.2.18",
38-
"bofire[optimization]==0.3.0",
39-
"pandas==2.3.3",
25+
"ray[default]>=2.53.0",
26+
"typer>=0.21.1",
27+
"rich>=14.2.0",
28+
"PyYAML>=6.0.3",
29+
"jinja2>=3.1.6",
30+
"pydantic>=2.12.5",
31+
"pydantic-settings>=2.12.0",
32+
"networkx>=3.6.1",
33+
"sqlalchemy>=2.0.45",
34+
"alembic>=1.18.0",
35+
"psycopg[binary]>=3.3.2",
36+
"asyncpg>=0.31.0",
37+
"minio>=7.2.20",
38+
"bofire[optimization]>=0.3.1",
39+
"pandas>=2.3.3",
4040
"ortools<=9.12",
41-
"litestar[standard]==2.18.0",
41+
"litestar[standard]>=2.19.0",
4242
]
4343

4444
[dependency-groups]
4545
worker = [
46-
"ray[default]==2.52.1",
47-
"typer==0.20.0",
48-
"rich==14.2.0",
49-
"pydantic==2.12.5",
50-
"pydantic-settings==2.12.0",
51-
"sqlalchemy==2.0.44",
52-
"alembic==1.17.2",
46+
"ray[default]>=2.53.0",
47+
"typer>=0.21.1",
48+
"rich>=14.2.0",
49+
"pydantic>=2.12.5",
50+
"pydantic-settings>=2.12.0",
51+
"sqlalchemy>=2.0.45",
52+
"alembic>=1.18.0",
5353
]
5454
optimizer_worker = [
5555
{ include-group = "worker" },
56-
"bofire[optimization]==0.3.0",
57-
"pandas==2.3.3",
56+
"bofire[optimization]>=0.3.1",
57+
"pandas>=2.3.3",
5858
]
5959

6060
sila2 = [

0 commit comments

Comments
 (0)