Skip to content

Commit

Permalink
Activity count feature addition (#249)
Browse files Browse the repository at this point in the history
  • Loading branch information
simon-p-2000 authored Feb 13, 2025
1 parent 26f6dcb commit 93d6325
Show file tree
Hide file tree
Showing 4 changed files with 253 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## [0.5.9] - 2025-02-01
### Added
- Add support to store NSRR token in environment variable or user config ([#243](https://github.com/cbrnr/sleepecg/pull/243) by [Simon Pusterhofer](https://github.com/simon-p-2000))
- Add support for downloading and storing activity counts for the MESA dataset ([#249](https://github.com/cbrnr/sleepecg/pull/249) by [Simon Pusterhofer](https://github.com/simon-p-2000))
- Add Python 3.13+ support by transforming wheel builds using ABI3 mode ([#251](https://github.com/cbrnr/sleepecg/pull/251) by [Eric Larson](https://github.com/larsoner))

### Changed
Expand Down
4 changes: 3 additions & 1 deletion docs/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Instead of always using [`set_nsrr_token()`](sleepecg.set_nsrr_token), you can s

SleepECG checks for the NSRR token in the following order:

1. Token set via [`set_nsrr_token()`][sleepecg.set_nsrr_token]
1. Token set via [`set_nsrr_token()`](sleepecg.set_nsrr_token)
2. Token set via environment variable `NSRR_TOKEN`
3. Token set in the user configuration

Expand All @@ -59,6 +59,8 @@ set_nsrr_token("<your-download-token-here>")
mesa = read_mesa(records_pattern="00*") # note that this is a generator
```

SleepECG supports downloading and storing activity counts for the MESA dataset. These metrics quantify a subject's movement based on accelerometer measurements recorded and processed using a proprietary algorithm in Philips Actiware. To access activity counts, call [`read_mesa()`](sleepecg.read_mesa) with `activity_source='actigraphy'` to download the data or `activity_source='cached'` to use previously downloaded counts.

!!! note
Reader functions are generators, so they do not return the data directly. To access the data, you need to consume the generator, either by iterating over it or with subsequent calls of `next()`.

Expand Down
150 changes: 147 additions & 3 deletions src/sleepecg/io/sleep_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import csv
import datetime
import os
from collections.abc import Iterator
from dataclasses import dataclass
from enum import IntEnum
Expand Down Expand Up @@ -86,6 +87,8 @@ class SleepRecord:
Times of heartbeats relative to recording start in seconds, by default `None`.
subject_data : SubjectData, optional
Dataclass containing subject data (such as gender or age), by default `None`.
activity_counts: np.ndarray, optional
Activity counts according to Actiwatch actigraphy, by default `None`.
"""

sleep_stages: np.ndarray | None = None
Expand All @@ -94,12 +97,14 @@ class SleepRecord:
recording_start_time: datetime.time | None = None
heartbeat_times: np.ndarray | None = None
subject_data: SubjectData | None = None
activity_counts: np.ndarray | None = None


class _ParseNsrrXmlResult(NamedTuple):
    """Parsed contents of an NSRR polysomnography annotation XML file.

    Produced by ``_parse_nsrr_xml()`` and consumed by the dataset readers.
    """

    # Annotated sleep stages as integer codes (dtype np.int8), in scored order.
    sleep_stages: np.ndarray
    # Duration of each sleep stage epoch in seconds (the XML ``EpochLength``).
    sleep_stage_duration: int
    # Clock time at which the recording was started.
    recording_start_time: datetime.time
    # Total duration of the recording in seconds (from the first ScoredEvent's
    # ``Duration`` element).
    recording_duration: float


def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
Expand All @@ -120,6 +125,8 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
Duration of each sleep stage in seconds.
recording_start_time : datetime.time
Time at which the recording was started.
recording_duration: float
Duration of the recording in seconds.
"""
STAGE_MAPPING = {
Expand All @@ -139,6 +146,12 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
raise RuntimeError(f"EpochLength not found in {xml_filepath}.")
epoch_length = int(epoch_length)

scored_event = root.find(".//ScoredEvent")
if scored_event is not None:
recording_duration = float(scored_event.findtext(".//Duration", ""))
else:
raise RuntimeError(f"Recording duration not found in {xml_filepath}.")

start_time = None
annot_stages = []

Expand All @@ -157,9 +170,7 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
raise RuntimeError(f"'Recording Start Time' not found in {xml_filepath}.")

return _ParseNsrrXmlResult(
np.array(annot_stages, dtype=np.int8),
epoch_length,
start_time,
np.array(annot_stages, dtype=np.int8), epoch_length, start_time, recording_duration
)


Expand All @@ -169,6 +180,7 @@ def read_mesa(
offline: bool = False,
keep_edfs: bool = False,
data_dir: str | Path | None = None,
activity_source: str | None = None,
) -> Iterator[SleepRecord]:
"""
Lazily read records from [MESA](https://sleepdata.org/datasets/mesa).
Expand Down Expand Up @@ -197,6 +209,10 @@ def read_mesa(
data_dir : str | pathlib.Path, optional
Directory where all datasets are stored. If `None` (default), the value will be
taken from the configuration.
activity_source : {'actigraphy', 'cached', None}, optional
If `None` (default), actigraphy data will not be downloaded. If `'actigraphy'`,
download actigraphy data from MESA dataset. If `'cached'`, get the cached activity
counts.
Yields
------
Expand All @@ -208,7 +224,10 @@ def read_mesa(
DB_SLUG = "mesa"
ANNOTATION_DIRNAME = "polysomnography/annotations-events-nsrr"
EDF_DIRNAME = "polysomnography/edfs"
ACTIVITY_DIRNAME = "actigraphy"
OVERLAP_DIRNAME = "overlap"
HEARTBEATS_DIRNAME = "preprocessed/heartbeats"
ACTIVITY_COUNTS_DIRNAME = "preprocessed/activity_counts"
RPOINTS_DIRNAME = "polysomnography/annotations-rpoints"

GENDER_MAPPING = {0: Gender.FEMALE, 1: Gender.MALE}
Expand All @@ -220,6 +239,13 @@ def read_mesa(
f"possible options: {heartbeats_source_options}"
)

activity_source_options = {None, "cached", "actigraphy"}
if activity_source not in activity_source_options:
raise ValueError(
f"Invalid value for parameter `activity_source`: {activity_source}, "
f"possible options: {activity_source_options}"
)

if data_dir is None:
data_dir = get_config_value("data_dir")

Expand All @@ -231,6 +257,13 @@ def read_mesa(
for directory in (annotations_dir, edf_dir, heartbeats_dir):
directory.mkdir(parents=True, exist_ok=True)

if activity_source is not None:
activity_dir = db_dir / ACTIVITY_DIRNAME
activity_counts_dir = db_dir / ACTIVITY_COUNTS_DIRNAME
overlap_dir = db_dir / OVERLAP_DIRNAME
for directory in (activity_dir, activity_counts_dir, overlap_dir):
directory.mkdir(parents=True, exist_ok=True)

if not offline:
download_url = _get_nsrr_url(DB_SLUG)

Expand Down Expand Up @@ -271,10 +304,37 @@ def read_mesa(
shallow=True,
)
checksums.update(rpoints_files)

if activity_source is not None:
activity_files = _list_nsrr(
db_slug=DB_SLUG,
subfolder="actigraphy",
pattern=f"mesa-sleep-{records_pattern}.csv",
shallow=True,
)
checksums.update(activity_files)
overlap_filename, overlap_checksum = _list_nsrr(
db_slug="mesa", subfolder="overlap", shallow=True
)[0]
overlap_filepath = db_dir / overlap_filename
_download_nsrr_file(
download_url + overlap_filename,
target_filepath=overlap_filepath,
checksum=overlap_checksum,
)

else:
subject_data_filepath = next((db_dir / "datasets").glob("mesa-sleep-dataset-*.csv"))
xml_paths = annotations_dir.glob(f"mesa-sleep-{records_pattern}-nsrr.xml")
requested_records = sorted([file.stem[:-5] for file in xml_paths])
if activity_source == "actigraphy":
overlap_filename = "mesa-actigraphy-psg-overlap.csv"
overlap_filepath = overlap_dir / overlap_filename
if not overlap_filepath.is_file():
raise RuntimeError(
"Overlap file not found, make sure it is in the correct directory "
"overlap/mesa-actigraphy-psg-overlap.csv."
)

subject_data_array = np.loadtxt(
subject_data_filepath,
Expand All @@ -291,6 +351,16 @@ def read_mesa(
age=age,
)

if activity_source == "actigraphy":
overlap_data = {}

with open(overlap_filepath) as csv_file:
overlap_reader = csv.DictReader(csv_file, delimiter=",")
for entry in overlap_reader:
mesaid = int(entry["mesaid"])
line = int(entry["line"])
overlap_data[mesaid] = line

for record_id in requested_records:
heartbeats_file = heartbeats_dir / f"{record_id}.npy"
if heartbeats_source == "annotation":
Expand Down Expand Up @@ -350,13 +420,87 @@ def read_mesa(

parsed_xml = _parse_nsrr_xml(xml_filepath)

activity_counts = None
if activity_source is not None:
activity_counts_file = activity_counts_dir / f"{record_id}-activity-counts.npy"
if activity_source == "cached":
if not activity_counts_file.is_file():
print(f"Skipping {record_id} due to missing cached activity counts.")
continue
activity_counts = np.load(activity_counts_file)
else:
mesaid = int(record_id.split("-")[2])
if mesaid not in overlap_data:
print(f"Skipping {record_id} due to missing overlap data.")
continue

activity_filename = ACTIVITY_DIRNAME + f"/{record_id}.csv"
activity_filepath = db_dir / activity_filename
if not offline:
_download_nsrr_file(
download_url + activity_filename,
activity_filepath,
checksums[activity_filename],
)

if not os.path.exists(activity_filepath):
print(f"Skipping {record_id} due to missing activity data.")
continue

activity_data = []

with open(activity_filepath) as csv_file:
reader = csv.reader(csv_file, delimiter=",")
header = next(reader)
for row in reader:
activity_data.append(dict(zip(header, row)))

recording_start_time = parsed_xml.recording_start_time
recording_duration = parsed_xml.recording_duration
recording_end_time = datetime.datetime.combine(
datetime.datetime.today(), recording_start_time
) + datetime.timedelta(seconds=recording_duration)

recording_end_time_seconds = recording_end_time.second
rounding_seconds = (
(30 - recording_end_time_seconds % 30)
if recording_end_time_seconds % 30 >= 15
else -(recording_end_time_seconds % 30)
)

recording_end_time = recording_end_time + datetime.timedelta(
seconds=rounding_seconds
)
recording_end_time_str = recording_end_time.strftime("%H:%M:%S").lstrip("0")

start_line = overlap_data[mesaid] + 1

for item in activity_data:
if item.get("linetime") == recording_end_time_str:
end_line = int(item["line"]) - 1
break
else:
print(
f"Skipping {record_id} due to missing line matching "
f"{recording_end_time_str}."
)
continue

activity_counts = [
item["activity"] for item in activity_data[start_line - 1 : end_line]
]

activity_counts = np.array(activity_counts)
np.save(activity_counts_file, activity_counts)

yield SleepRecord(
sleep_stages=parsed_xml.sleep_stages,
sleep_stage_duration=parsed_xml.sleep_stage_duration,
id=record_id,
recording_start_time=parsed_xml.recording_start_time,
heartbeat_times=heartbeat_times,
subject_data=subject_data[record_id],
activity_counts=activity_counts,
)


Expand Down
Loading

0 comments on commit 93d6325

Please sign in to comment.