Skip to content

Commit

Permalink
[TEST][REF] Refactor and test functions related to creating participants.tsv for AIBL (aramis-lab#1406)
Browse files Browse the repository at this point in the history

* WIP 1611

* WIP 2801

* Finish testing 2811

* Changes as suggested
  • Loading branch information
AliceJoubert authored Jan 30, 2025
1 parent 424b25d commit fd342a7
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 77 deletions.
113 changes: 36 additions & 77 deletions clinica/iotools/converters/aibl_to_bids/utils/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def create_participants_tsv_file(
input_path: Path,
bids_path: Path,
clinical_specifications_folder: Path,
clinical_data_dir: Path,
delete_non_bids_info: bool = True,
Expand All @@ -24,8 +24,8 @@ def create_participants_tsv_file(
Parameters
----------
input_path : Path
The path to the input directory.
bids_path : Path
The path to the BIDS directory.
clinical_specifications_folder : Path
The path to the folder containing the clinical specification files.
Expand All @@ -38,99 +38,58 @@ def create_participants_tsv_file(
available in the BIDS dataset.
Default=True.
"""
import glob
import os
from os import path
from clinica.utils.stream import cprint

import numpy as np

fields_bids = ["participant_id"]
fields_dataset = []
study = StudyName.AIBL
prev_location = ""
prev_sheet = ""
index_to_drop = []

specifications = _load_specifications(
clinical_specifications_folder, "participant.tsv"
)
participant_fields_db = specifications[StudyName.AIBL.value]
field_location = specifications[f"{StudyName.AIBL.value} location"]
participant_fields_bids = specifications["BIDS CLINICA"]

# Extract the list of the available fields for the dataset (and the corresponding BIDS version)
for i in range(0, len(participant_fields_db)):
if not pd.isnull(participant_fields_db[i]):
fields_bids.append(participant_fields_bids[i])
fields_dataset.append(participant_fields_db[i])

# Init the dataframe that will be saved in the file participant.tsv
participant_df = pd.DataFrame(columns=fields_bids)

for i in range(0, len(participant_fields_db)):
# If a field not empty is found
if not pd.isnull(participant_fields_db[i]):
# Extract the file location of the field and read the value from the file
tmp = field_location[i].split("/")
location = tmp[0]
# If a sheet is available
sheet = tmp[1] if len(tmp) > 1 else ""
# Check if the file to open for a certain field it's the same of the previous field
if location == prev_location and sheet == prev_sheet:
pass
else:
file_ext = os.path.splitext(location)[1]
file_to_read_path = path.join(clinical_data_dir, location)

if file_ext == ".xlsx":
file_to_read = pd.read_excel(
glob.glob(file_to_read_path)[0], sheet_name=sheet
)
elif file_ext == ".csv":
file_to_read = pd.read_csv(glob.glob(file_to_read_path)[0])
prev_location = location
prev_sheet = sheet

field_col_values = []
# For each field in fields_dataset extract all the column values
for j in range(0, len(file_to_read)):
# Convert the alternative_id_1 to string if is an integer/float
if participant_fields_bids[i] == "alternative_id_1" and (
file_to_read[participant_fields_db[i]].dtype == np.float64
or file_to_read[participant_fields_db[i]].dtype == np.int64
):
if not pd.isnull(file_to_read.at[j, participant_fields_db[i]]):
# value_to_append = str(file_to_read.get_value(j, participant_fields_db[i])).rstrip('.0')
value_to_append = str(
file_to_read.at[j, participant_fields_db[i]]
)
else:
value_to_append = "n/a"
else:
value_to_append = file_to_read.at[j, participant_fields_db[i]]
field_col_values.append(value_to_append)
# Add the extracted column to the participant_df
participant_df[participant_fields_bids[i]] = pd.Series(field_col_values)
)[[study.value, f"{study.value} location", "BIDS CLINICA"]].dropna()

participant_df = pd.DataFrame()
for _, row in specifications.iterrows():
if (location := row[f"{study.value} location"]) != prev_location:
file_to_read = _load_metadata_from_pattern(clinical_data_dir, location)
prev_location = location
participant_df[row["BIDS CLINICA"]] = file_to_read[row[study.value]].astype(str)

# Compute BIDS-compatible participant ID.
participant_df["participant_id"] = participant_df["alternative_id_1"].apply(
lambda x: bids_id_factory(StudyName.AIBL).from_original_study_id(x)
participant_df.insert(
0,
"participant_id",
participant_df["alternative_id_1"].apply(
lambda x: bids_id_factory(StudyName.AIBL).from_original_study_id(x)
),
)
participant_df.drop(labels="alternative_id_1", axis=1, inplace=True)

# Keep year-of-birth only.
participant_df["date_of_birth"] = participant_df["date_of_birth"].str.extract(
r"/(\d{4}).*"
)
# Normalize sex value.
participant_df["sex"] = participant_df["sex"].map({1: "M", 2: "F"}).fillna("n/a")

participant_df["sex"] = participant_df["sex"].map({"1": "M", "2": "F"})
# Normalize known NA values.
participant_df.replace(-4, "n/a", inplace=True)
participant_df.fillna("n/a", inplace=True)
participant_df.replace("-4", "n/a", inplace=True)

# Delete all the rows of the subjects that are not available in the BIDS dataset
if delete_non_bids_info:
participant_df = participant_df.drop(index_to_drop)
subjects_to_keep = [d.name for d in bids_path.glob("sub-*")]
participant_df.set_index("participant_id", inplace=True, drop=False)
for subject in subjects_to_keep:
if subject not in participant_df.index:
cprint(
f"No clinical data was found for participant {subject}.",
lvl="warning",
)
participant_df.loc[subject] = "n/a"
participant_df.loc[subject, "participant_id"] = subject
participant_df = participant_df.loc[subjects_to_keep]

participant_df.to_csv(
input_path / "participants.tsv",
bids_path / "participants.tsv",
sep="\t",
index=False,
encoding="utf8",
Expand Down
56 changes: 56 additions & 0 deletions test/unittests/iotools/converters/aibl_to_bids/test_aibl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,23 @@ def build_scans_spec(tmp_path: Path) -> Path:
return tmp_path


def build_participants_spec(tmp_path: Path) -> Path:
    """Write a minimal AIBL ``participant.tsv`` specification into *tmp_path*.

    The spec maps each BIDS CLINICA column name to its AIBL source column and
    the glob pattern of the clinical-data file that holds it.

    Returns the directory the spec file was written to.
    """
    demog_pattern = "aibl_ptdemog_*.csv"
    spec = pd.DataFrame(
        {
            "BIDS CLINICA": ["alternative_id_1", "date_of_birth", "sex", "apoe_gen1"],
            "AIBL": ["RID", "PTDOB", "PTGENDER", "APGEN1"],
            "AIBL location": [
                demog_pattern,
                demog_pattern,
                demog_pattern,
                "aibl_apoeres_*.csv",
            ],
        }
    )
    spec.to_csv(tmp_path / "participant.tsv", sep="\t", index=False)
    return tmp_path


def build_sessions_spec(tmp_path: Path) -> Path:
spec = pd.DataFrame(
{
Expand Down Expand Up @@ -72,6 +89,15 @@ def build_clinical_data(tmp_path: Path) -> Path:
data_path = tmp_path / "clinical_data"
data_path.mkdir()

apo = pd.DataFrame(
{
"RID": [1, 2, 100, 100],
"VISCODE": ["bl", "bl", "bl", "m12"],
"APGEN1": [1, 2, -4, 3],
}
)
apo.to_csv(data_path / "aibl_apoeres_230ct2024.csv", index=False)

neuro = pd.DataFrame(
{
"RID": [1, 2, 12, 100, 100, 109, 109], # %m/%d/%Y
Expand All @@ -94,6 +120,7 @@ def build_clinical_data(tmp_path: Path) -> Path:
"RID": [1, 2, 12, 101],
"VISCODE": ["bl", "bl", "bl", "bl"],
"PTDOB": ["/1901", "/1902", "/1912", "/2001"],
"PTGENDER": [1, 2, 1, 2],
}
)
ptdemog.to_csv(data_path / "aibl_ptdemog_230ct2024.csv", index=False)
Expand Down Expand Up @@ -714,3 +741,32 @@ def test_write_scans_tsv(tmp_path):
)

assert_frame_equal(result, expected)


def test_create_participants_tsv(tmp_path):
    """End-to-end check of ``create_participants_tsv_file`` on synthetic AIBL data.

    Builds a fake BIDS tree, spec, and clinical data under *tmp_path*, runs the
    converter, and compares the single generated participants.tsv against the
    expected frame (subjects missing clinical data are filled with "n/a").
    """
    from clinica.iotools.converters.aibl_to_bids.utils.clinical import (
        create_participants_tsv_file,
    )

    bids_dir = build_bids_dir(tmp_path)
    spec_dir = build_participants_spec(tmp_path)
    data_dir = build_clinical_data(tmp_path)
    create_participants_tsv_file(bids_dir, spec_dir, data_dir)

    # Exactly one participants.tsv must be produced anywhere under the BIDS dir.
    tsv_files = list(bids_dir.rglob("*participants.tsv"))
    assert len(tsv_files) == 1

    # na_filter=False keeps the literal "n/a" strings instead of parsing NaN.
    obtained = pd.read_csv(tsv_files[0], sep="\t", na_filter=False).set_index(
        "participant_id"
    )
    expected = pd.DataFrame(
        {
            "participant_id": ["sub-AIBL1", "sub-AIBL100", "sub-AIBL109"],
            "date_of_birth": ["1901", "n/a", "n/a"],
            "sex": ["M", "n/a", "n/a"],
            "apoe_gen1": ["1", "n/a", "n/a"],
        }
    ).set_index("participant_id")
    assert_frame_equal(obtained, expected, check_like=True)

0 comments on commit fd342a7

Please sign in to comment.