Skip to content

Commit

Permalink
[TEST][REF] Refactor and test functions related to creating participants.tsv for AIBL (aramis-lab#1406)
Browse files Browse the repository at this point in the history

* WIP 1611

* WIP 2801

* Finish testing 2811

* Changes as suggested
  • Loading branch information
AliceJoubert authored Jan 30, 2025
1 parent 424b25d commit fd342a7
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 77 deletions.
113 changes: 36 additions & 77 deletions clinica/iotools/converters/aibl_to_bids/utils/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def create_participants_tsv_file(
input_path: Path,
bids_path: Path,
clinical_specifications_folder: Path,
clinical_data_dir: Path,
delete_non_bids_info: bool = True,
Expand All @@ -24,8 +24,8 @@ def create_participants_tsv_file(
Parameters
----------
input_path : Path
The path to the input directory.
bids_path : Path
The path to the BIDS directory.
clinical_specifications_folder : Path
The path to the folder containing the clinical specification files.
Expand All @@ -38,99 +38,58 @@ def create_participants_tsv_file(
available in the BIDS dataset.
Default=True.
"""
import glob
import os
from os import path
from clinica.utils.stream import cprint

import numpy as np

fields_bids = ["participant_id"]
fields_dataset = []
study = StudyName.AIBL
prev_location = ""
prev_sheet = ""
index_to_drop = []

specifications = _load_specifications(
clinical_specifications_folder, "participant.tsv"
)
participant_fields_db = specifications[StudyName.AIBL.value]
field_location = specifications[f"{StudyName.AIBL.value} location"]
participant_fields_bids = specifications["BIDS CLINICA"]

# Extract the list of the available fields for the dataset (and the corresponding BIDS version)
for i in range(0, len(participant_fields_db)):
if not pd.isnull(participant_fields_db[i]):
fields_bids.append(participant_fields_bids[i])
fields_dataset.append(participant_fields_db[i])

# Init the dataframe that will be saved in the file participant.tsv
participant_df = pd.DataFrame(columns=fields_bids)

for i in range(0, len(participant_fields_db)):
# If a field not empty is found
if not pd.isnull(participant_fields_db[i]):
# Extract the file location of the field and read the value from the file
tmp = field_location[i].split("/")
location = tmp[0]
# If a sheet is available
sheet = tmp[1] if len(tmp) > 1 else ""
# Check if the file to open for a certain field it's the same of the previous field
if location == prev_location and sheet == prev_sheet:
pass
else:
file_ext = os.path.splitext(location)[1]
file_to_read_path = path.join(clinical_data_dir, location)

if file_ext == ".xlsx":
file_to_read = pd.read_excel(
glob.glob(file_to_read_path)[0], sheet_name=sheet
)
elif file_ext == ".csv":
file_to_read = pd.read_csv(glob.glob(file_to_read_path)[0])
prev_location = location
prev_sheet = sheet

field_col_values = []
# For each field in fields_dataset extract all the column values
for j in range(0, len(file_to_read)):
# Convert the alternative_id_1 to string if is an integer/float
if participant_fields_bids[i] == "alternative_id_1" and (
file_to_read[participant_fields_db[i]].dtype == np.float64
or file_to_read[participant_fields_db[i]].dtype == np.int64
):
if not pd.isnull(file_to_read.at[j, participant_fields_db[i]]):
# value_to_append = str(file_to_read.get_value(j, participant_fields_db[i])).rstrip('.0')
value_to_append = str(
file_to_read.at[j, participant_fields_db[i]]
)
else:
value_to_append = "n/a"
else:
value_to_append = file_to_read.at[j, participant_fields_db[i]]
field_col_values.append(value_to_append)
# Add the extracted column to the participant_df
participant_df[participant_fields_bids[i]] = pd.Series(field_col_values)
)[[study.value, f"{study.value} location", "BIDS CLINICA"]].dropna()

participant_df = pd.DataFrame()
for _, row in specifications.iterrows():
if (location := row[f"{study.value} location"]) != prev_location:
file_to_read = _load_metadata_from_pattern(clinical_data_dir, location)
prev_location = location
participant_df[row["BIDS CLINICA"]] = file_to_read[row[study.value]].astype(str)

# Compute BIDS-compatible participant ID.
participant_df["participant_id"] = participant_df["alternative_id_1"].apply(
lambda x: bids_id_factory(StudyName.AIBL).from_original_study_id(x)
participant_df.insert(
0,
"participant_id",
participant_df["alternative_id_1"].apply(
lambda x: bids_id_factory(StudyName.AIBL).from_original_study_id(x)
),
)
participant_df.drop(labels="alternative_id_1", axis=1, inplace=True)

# Keep year-of-birth only.
participant_df["date_of_birth"] = participant_df["date_of_birth"].str.extract(
r"/(\d{4}).*"
)
# Normalize sex value.
participant_df["sex"] = participant_df["sex"].map({1: "M", 2: "F"}).fillna("n/a")

participant_df["sex"] = participant_df["sex"].map({"1": "M", "2": "F"})
# Normalize known NA values.
participant_df.replace(-4, "n/a", inplace=True)
participant_df.fillna("n/a", inplace=True)
participant_df.replace("-4", "n/a", inplace=True)

# Delete all the rows of the subjects that are not available in the BIDS dataset
if delete_non_bids_info:
participant_df = participant_df.drop(index_to_drop)
subjects_to_keep = [d.name for d in bids_path.glob("sub-*")]
participant_df.set_index("participant_id", inplace=True, drop=False)
for subject in subjects_to_keep:
if subject not in participant_df.index:
cprint(
f"No clinical data was found for participant {subject}.",
lvl="warning",
)
participant_df.loc[subject] = "n/a"
participant_df.loc[subject, "participant_id"] = subject
participant_df = participant_df.loc[subjects_to_keep]

participant_df.to_csv(
input_path / "participants.tsv",
bids_path / "participants.tsv",
sep="\t",
index=False,
encoding="utf8",
Expand Down
56 changes: 56 additions & 0 deletions test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,23 @@ def build_scans_spec(tmp_path: Path) -> Path:
return tmp_path


def build_participants_spec(tmp_path: Path) -> Path:
    """Write a minimal AIBL ``participant.tsv`` specification into *tmp_path*.

    The spec maps each BIDS CLINICA column name to its AIBL source column and
    the glob pattern of the clinical-data file that holds it.

    Returns the directory the spec file was written to.
    """
    demog_pattern = "aibl_ptdemog_*.csv"
    spec = pd.DataFrame(
        {
            "BIDS CLINICA": ["alternative_id_1", "date_of_birth", "sex", "apoe_gen1"],
            "AIBL": ["RID", "PTDOB", "PTGENDER", "APGEN1"],
            "AIBL location": [
                demog_pattern,
                demog_pattern,
                demog_pattern,
                "aibl_apoeres_*.csv",
            ],
        }
    )
    spec.to_csv(tmp_path / "participant.tsv", sep="\t", index=False)
    return tmp_path


def build_sessions_spec(tmp_path: Path) -> Path:
spec = pd.DataFrame(
{
Expand Down Expand Up @@ -72,6 +89,15 @@ def build_clinical_data(tmp_path: Path) -> Path:
data_path = tmp_path / "clinical_data"
data_path.mkdir()

apo = pd.DataFrame(
{
"RID": [1, 2, 100, 100],
"VISCODE": ["bl", "bl", "bl", "m12"],
"APGEN1": [1, 2, -4, 3],
}
)
apo.to_csv(data_path / "aibl_apoeres_230ct2024.csv", index=False)

neuro = pd.DataFrame(
{
"RID": [1, 2, 12, 100, 100, 109, 109], # %m/%d/%Y
Expand All @@ -94,6 +120,7 @@ def build_clinical_data(tmp_path: Path) -> Path:
"RID": [1, 2, 12, 101],
"VISCODE": ["bl", "bl", "bl", "bl"],
"PTDOB": ["/1901", "/1902", "/1912", "/2001"],
"PTGENDER": [1, 2, 1, 2],
}
)
ptdemog.to_csv(data_path / "aibl_ptdemog_230ct2024.csv", index=False)
Expand Down Expand Up @@ -714,3 +741,32 @@ def test_write_scans_tsv(tmp_path):
)

assert_frame_equal(result, expected)


def test_create_participants_tsv(tmp_path):
    """End-to-end check of ``create_participants_tsv_file`` on synthetic AIBL data.

    Builds a fake BIDS tree, spec, and clinical data under *tmp_path*, runs the
    converter, and compares the single generated participants.tsv against the
    expected frame (subjects missing clinical data are filled with "n/a").
    """
    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
        create_participants_tsv_file,
    )

    bids_dir = build_bids_dir(tmp_path)
    spec_dir = build_participants_spec(tmp_path)
    data_dir = build_clinical_data(tmp_path)
    create_participants_tsv_file(bids_dir, spec_dir, data_dir)

    # Exactly one participants.tsv must be produced anywhere under the BIDS dir.
    tsv_files = list(bids_dir.rglob("*participants.tsv"))
    assert len(tsv_files) == 1

    # na_filter=False keeps the literal "n/a" strings instead of parsing NaN.
    obtained = pd.read_csv(tsv_files[0], sep="\t", na_filter=False).set_index(
        "participant_id"
    )
    expected = pd.DataFrame(
        {
            "participant_id": ["sub-AIBL1", "sub-AIBL100", "sub-AIBL109"],
            "date_of_birth": ["1901", "n/a", "n/a"],
            "sex": ["M", "n/a", "n/a"],
            "apoe_gen1": ["1", "n/a", "n/a"],
        }
    ).set_index("participant_id")
    assert_frame_equal(obtained, expected, check_like=True)

0 comments on commit fd342a7

Please sign in to comment.