Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Activity count feature addition #249

Merged
merged 43 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
46be724
Added feature to download and process actigraphy data from mesa datas…
simon-p-2000 Dec 7, 2024
50083cf
reformatting sleep readers, adding dummy data for actigraphy and overlap
simon-p-2000 Dec 8, 2024
7123008
added first test case, fixed errors so test case passes
simon-p-2000 Dec 13, 2024
0cd801f
Update src/sleepecg/io/sleep_readers.py
simon-p-2000 Dec 13, 2024
4833b3f
fixing typo in variable name
simon-p-2000 Dec 13, 2024
d7093b6
no longer using pandas for test_sleep_readers.py
simon-p-2000 Dec 15, 2024
9364512
reworked sleep_readers.py so pandas is not used, reworked test case t…
simon-p-2000 Dec 16, 2024
df4f33f
Added more tests, fixed annotation error
simon-p-2000 Dec 19, 2024
9828529
style fixes
simon-p-2000 Dec 19, 2024
548725d
removed test case that relied on online data
simon-p-2000 Dec 20, 2024
acc783e
fixed mypy errors, added caching of processed actigraphy files
simon-p-2000 Dec 23, 2024
7dff0c3
more mypy error fixes
simon-p-2000 Dec 23, 2024
932ad74
further style fixes
simon-p-2000 Dec 23, 2024
38284b2
added feature extraction for actigraphy, reworked tests
simon-p-2000 Jan 2, 2025
e26c07b
reworked style
simon-p-2000 Jan 2, 2025
4322d06
style fix in feature extraction
simon-p-2000 Jan 2, 2025
392fdd0
Fix style
cbrnr Jan 14, 2025
51dcffb
Simplify
cbrnr Jan 14, 2025
cde13c5
Applying changes proposed in PR review
simon-p-2000 Jan 14, 2025
aa93f36
minor style fix for imports in test_sleep_readers.py
simon-p-2000 Jan 15, 2025
46f0915
reworking mesa sleep reader to skip records not contained in overlap …
simon-p-2000 Jan 16, 2025
1ff2c99
fixing mypy style error
simon-p-2000 Jan 16, 2025
340a891
more mypy error fixes
simon-p-2000 Jan 16, 2025
140d8db
style fix for sleep reader
simon-p-2000 Jan 17, 2025
9bb5e28
resetting changes out of scope of this PR
simon-p-2000 Jan 17, 2025
1bfa060
Merge branch 'cbrnr:main' into activity_counts_model_extension
simon-p-2000 Jan 19, 2025
64fd527
Merge branch 'cbrnr:main' into accelerometer_feature_addition
simon-p-2000 Jan 24, 2025
a7e37c8
initial commit for model extension via activity counts feature
simon-p-2000 Jan 24, 2025
4665034
added documentation for PR
simon-p-2000 Jan 25, 2025
1a8adce
Merge branch 'cbrnr:main' into activity_counts_model_extension
simon-p-2000 Feb 11, 2025
b446ecf
Merge branch 'cbrnr:main' into accelerometer_feature_addition
simon-p-2000 Feb 12, 2025
05cccea
Update CHANGELOG.md
simon-p-2000 Feb 12, 2025
b1596c3
Update docs/datasets.md
simon-p-2000 Feb 12, 2025
fcf6260
adding docstring in test sleep readers, error handling for missing ac…
simon-p-2000 Feb 12, 2025
d61c503
skipping records if line time is not matched
simon-p-2000 Feb 12, 2025
800e9d6
attempting to fix mypy error
simon-p-2000 Feb 12, 2025
6e2ad49
Try to fix mypy error
cbrnr Feb 13, 2025
1d56282
Move line
cbrnr Feb 13, 2025
bd52ffa
reworking loop in read_mesa
simon-p-2000 Feb 13, 2025
ad8c6ec
Merge branch 'activity_counts_model_extension' into accelerometer_fea…
simon-p-2000 Feb 13, 2025
67cb257
reverting previous commit
simon-p-2000 Feb 13, 2025
903c08c
style fixes
simon-p-2000 Feb 13, 2025
d765569
Remove empty line
cbrnr Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions examples/classifiers/wrn_gru_mesa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
set_nsrr_token,
)

set_nsrr_token("your-token-here")
set_nsrr_token("YOUR TOKEN HERE")

TRAIN = True # set to False to skip training and load classifier from disk
TRAIN = False # set to False to skip training and load classifier from disk
cbrnr marked this conversation as resolved.
Show resolved Hide resolved

# silence warnings (which might pop up during feature extraction)
warnings.filterwarnings(
Expand All @@ -27,7 +27,9 @@
if TRAIN:
print("‣ Starting training...")
print("‣‣ Extracting features...")
records = list(read_mesa(offline=False))
records = list(
read_mesa(offline=False, records_pattern="000*", activity_source="actigraphy")
cbrnr marked this conversation as resolved.
Show resolved Hide resolved
)

feature_extraction_params = {
"lookback": 120,
Expand All @@ -38,6 +40,7 @@
"recording_start_time",
"age",
"gender",
"activity_counts",
],
"min_rri": 0.3,
"max_rri": 2,
Expand Down
10 changes: 9 additions & 1 deletion src/sleepecg/feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,13 @@
"LF_HF_ratio",
),
"metadata": ("recording_start_time", "age", "gender", "weight"),
"actigraphy": ("activity_counts", "dummy_feature"),
}
_FEATURE_ID_TO_GROUP = {
id: group
for group, ids in _FEATURE_GROUPS.items()
for id in (ids if isinstance(ids, tuple) else (ids,))
cbrnr marked this conversation as resolved.
Show resolved Hide resolved
}
_FEATURE_ID_TO_GROUP = {id: group for group, ids in _FEATURE_GROUPS.items() for id in ids}

_TIME_DOMAIN_EXPECTED_WARNING_MESSAGES = (
"All-NaN slice encountered",
Expand Down Expand Up @@ -656,6 +661,9 @@ def _extract_features_single(
)
elif feature_group == "metadata":
X.append(_metadata_features(record, num_stages))
elif feature_group == "actigraphy":
if record.activity_counts is not None:
X.append(record.activity_counts.reshape(-1, 1))
features = np.hstack(X)[:, col_indices]

if record.sleep_stages is None or sleep_stage_duration == record.sleep_stage_duration:
Expand Down
149 changes: 146 additions & 3 deletions src/sleepecg/io/sleep_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ class SleepRecord:
Times of heartbeats relative to recording start in seconds, by default `None`.
subject_data : SubjectData, optional
Dataclass containing subject data (such as gender or age), by default `None`.
activity_counts : np.ndarray, optional
Activity counts as measured by Actiwatch actigraphy, by default `None`.
"""

sleep_stages: np.ndarray | None = None
Expand All @@ -94,12 +96,14 @@ class SleepRecord:
recording_start_time: datetime.time | None = None
heartbeat_times: np.ndarray | None = None
subject_data: SubjectData | None = None
activity_counts: np.ndarray | None = None


class _ParseNsrrXmlResult(NamedTuple):
sleep_stages: np.ndarray
sleep_stage_duration: int
recording_start_time: datetime.time
recording_duration: float


def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
Expand All @@ -120,6 +124,8 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
Duration of each sleep stage in seconds.
recording_start_time : datetime.time
Time at which the recording was started.
recording_duration : float
Duration of the recording in seconds.

"""
STAGE_MAPPING = {
Expand All @@ -139,6 +145,12 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
raise RuntimeError(f"EpochLength not found in {xml_filepath}.")
epoch_length = int(epoch_length)

scored_event = root.find(".//ScoredEvent")
if scored_event is not None:
recording_duration = float(scored_event.findtext(".//Duration", ""))
else:
raise RuntimeError(f"Recording duration not found in {xml_filepath}.")

start_time = None
annot_stages = []

Expand All @@ -157,9 +169,7 @@ def _parse_nsrr_xml(xml_filepath: Path) -> _ParseNsrrXmlResult:
raise RuntimeError(f"'Recording Start Time' not found in {xml_filepath}.")

return _ParseNsrrXmlResult(
np.array(annot_stages, dtype=np.int8),
epoch_length,
start_time,
np.array(annot_stages, dtype=np.int8), epoch_length, start_time, recording_duration
)


Expand All @@ -169,6 +179,7 @@ def read_mesa(
offline: bool = False,
keep_edfs: bool = False,
data_dir: str | Path | None = None,
activity_source: str | None = None,
) -> Iterator[SleepRecord]:
"""
Lazily read records from [MESA](https://sleepdata.org/datasets/mesa).
Expand Down Expand Up @@ -197,6 +208,10 @@ def read_mesa(
data_dir : str | pathlib.Path, optional
Directory where all datasets are stored. If `None` (default), the value will be
taken from the configuration.
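activity_source : {'actigraphy', 'cached', None}, optional
If `None` (default), activity counts are not loaded and no actigraphy data is
downloaded. If `'actigraphy'`, read the actigraphy data of the MESA dataset
(downloading it first unless `offline` is set), extract the activity counts and
cache them. If `'cached'`, load previously cached activity counts; records without
a cached file are skipped.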

Yields
------
Expand All @@ -208,7 +223,10 @@ def read_mesa(
DB_SLUG = "mesa"
ANNOTATION_DIRNAME = "polysomnography/annotations-events-nsrr"
EDF_DIRNAME = "polysomnography/edfs"
ACTIVITY_DIRNAME = "actigraphy"
OVERLAP_DIRNAME = "overlap"
HEARTBEATS_DIRNAME = "preprocessed/heartbeats"
ACTIVITY_COUNTS_DIRNAME = "preprocessed/activity_counts"
RPOINTS_DIRNAME = "polysomnography/annotations-rpoints"

GENDER_MAPPING = {0: Gender.FEMALE, 1: Gender.MALE}
Expand All @@ -220,6 +238,13 @@ def read_mesa(
f"possible options: {heartbeats_source_options}"
)

activity_source_options = {None, "cached", "actigraphy"}
if activity_source not in activity_source_options:
raise ValueError(
f"Invalid value for parameter `activity_source`: {activity_source}, "
f"possible options: {activity_source_options}"
)

if data_dir is None:
data_dir = get_config_value("data_dir")

Expand All @@ -231,6 +256,13 @@ def read_mesa(
for directory in (annotations_dir, edf_dir, heartbeats_dir):
directory.mkdir(parents=True, exist_ok=True)

if activity_source is not None:
activity_dir = db_dir / ACTIVITY_DIRNAME
activity_counts_dir = db_dir / ACTIVITY_COUNTS_DIRNAME
overlap_dir = db_dir / OVERLAP_DIRNAME
for directory in (activity_dir, activity_counts_dir, overlap_dir):
directory.mkdir(parents=True, exist_ok=True)

if not offline:
download_url = _get_nsrr_url(DB_SLUG)

Expand Down Expand Up @@ -271,10 +303,37 @@ def read_mesa(
shallow=True,
)
checksums.update(rpoints_files)

if activity_source is not None:
activity_files = _list_nsrr(
db_slug=DB_SLUG,
subfolder="actigraphy",
pattern=f"mesa-sleep-{records_pattern}.csv",
shallow=True,
)
checksums.update(activity_files)
overlap_filename, overlap_checksum = _list_nsrr(
db_slug="mesa", subfolder="overlap", shallow=True
)[0]
overlap_filepath = db_dir / overlap_filename
_download_nsrr_file(
download_url + overlap_filename,
target_filepath=overlap_filepath,
checksum=overlap_checksum,
)

else:
subject_data_filepath = next((db_dir / "datasets").glob("mesa-sleep-dataset-*.csv"))
xml_paths = annotations_dir.glob(f"mesa-sleep-{records_pattern}-nsrr.xml")
requested_records = sorted([file.stem[:-5] for file in xml_paths])
if activity_source == "actigraphy":
overlap_filename = "mesa-actigraphy-psg-overlap.csv"
overlap_filepath = overlap_dir / overlap_filename
if not overlap_filepath.is_file():
raise RuntimeError(
"Overlap file not found, make sure it is in the correct directory "
"overlap/mesa-actigraphy-psg-overlap.csv."
)

subject_data_array = np.loadtxt(
subject_data_filepath,
Expand All @@ -291,6 +350,15 @@ def read_mesa(
age=age,
)

if activity_source == "actigraphy":
overlap_data = []

with open(overlap_filepath) as csv_file:
reader = csv.reader(csv_file, delimiter=",")
header = next(reader)
for row in reader:
overlap_data.append(dict(zip(header, row)))

for record_id in requested_records:
heartbeats_file = heartbeats_dir / f"{record_id}.npy"
if heartbeats_source == "annotation":
Expand Down Expand Up @@ -350,13 +418,88 @@ def read_mesa(

parsed_xml = _parse_nsrr_xml(xml_filepath)

activity_counts = None
if activity_source is not None:
activity_counts_file = activity_counts_dir / f"{record_id}-activity-counts.npy"
if activity_source == "cached":
if not activity_counts_file.is_file():
print(f"Skipping {record_id} due to missing cached activity counts.")
continue
activity_counts = np.load(activity_counts_file)
else:
activity_filename = ACTIVITY_DIRNAME + f"/{record_id}.csv"
activity_filepath = db_dir / activity_filename
if not offline:
_download_nsrr_file(
download_url + activity_filename,
activity_filepath,
checksums[activity_filename],
)

activity_data = []

with open(activity_filepath) as csv_file:
reader = csv.reader(csv_file, delimiter=",")
header = next(reader)
for row in reader:
activity_data.append(dict(zip(header, row)))

recording_start_time = parsed_xml.recording_start_time
recording_duration = parsed_xml.recording_duration
recording_end_time = datetime.datetime.combine(
datetime.datetime.today(), recording_start_time
) + datetime.timedelta(seconds=recording_duration)

recording_end_time_seconds = recording_end_time.second
rounding_seconds = (
(30 - recording_end_time_seconds % 30)
if recording_end_time_seconds % 30 >= 15
else -(recording_end_time_seconds % 30)
)

recording_end_time = recording_end_time + datetime.timedelta(
seconds=rounding_seconds
)
recording_end_time_str = recording_end_time.strftime("%H:%M:%S").lstrip("0")

mesa_id = activity_data[0].get("mesaid")

start_line = (
int(
next(
row["line"]
for row in overlap_data
if row.get("mesaid") == mesa_id
)
)
+ 1
)
end_line = (
int(
next(
row["line"]
for row in activity_data
if row.get("linetime") == recording_end_time_str
)
)
- 1
)
cbrnr marked this conversation as resolved.
Show resolved Hide resolved

activity_counts = [
row["activity"] for row in activity_data[start_line - 1 : end_line]
]

activity_counts = np.array(activity_counts)
np.save(activity_counts_file, activity_counts)

yield SleepRecord(
sleep_stages=parsed_xml.sleep_stages,
sleep_stage_duration=parsed_xml.sleep_stage_duration,
id=record_id,
recording_start_time=parsed_xml.recording_start_time,
heartbeat_times=heartbeat_times,
subject_data=subject_data[record_id],
activity_counts=activity_counts,
)


Expand Down
Loading