neurostuff · victoris93 · Jun 23, 2025 · Jun 23, 2025
diff --git a/evaluation/run_evaluation.py b/evaluation/run_evaluation.py
@@ -82,6 +82,13 @@ def run_extraction_pipelines():
                 "env_variable": "OPENAI_API_KEY",
             },
         ),
+        (
+            "contrasts",
+            {
+                "extraction_model": "gpt-4o-mini-2024-07-18",
+                "env_variable": "OPENAI_API_KEY",
+            },
+        ),
     ]
 
     # Define paths
@@ -168,7 +175,39 @@ def load_extracted_results(output_path: Path, dataset_type: str) -> pd.DataFrame
                     "diagnosis": group.get("diagnosis", ""),
                 }
                 all_records.append(record)
-        else:
+
+        elif dataset_type == "contrasts":
+                    # For contrasts extractor, extract contrast information
+                    if "MRIContrasts" in results:
+                        # All contrasts extracted for this study (empty list if absent)
+                        contrasts = results.get("MRIContrasts", [])
+
+                        if not identifiers.get("pmcid"):
+                            continue
+
+                    record = {
+                        "pmcid": str(
+                            identifiers.get("pmcid").lstrip("PMC")
+                        ),
+                        "comparison": contrasts.get("comparison", ""),
+                        "control_group": group.get("control_group", ""),
+                        "group": group.get("group", ""),
+                        "contrast_statistc": group.get("contrast_statistc", ""),
+                        "atlas": group.get("atlas", ""),
+                        "atlas_n_regions": group.get("atlas_n_regions"),
+                        "roi": group.get("roi", ""),
+                        "coord_system": group.get("coord_system", ""),
+                        "x": group.get("x"),
+                        "y": group.get("y"),
+                        "z": group.get("z"),
+                        "significance": group.get("significance"),
+                        "significance_level": group.get("significance_level", ""),
+
+                    }
+
+                    all_records.append(record)
+
+        elif dataset_type == "task":
             # For task extractor, extract task information
             if "fMRITasks" in results:
                 # Combine all tasks into a single record per study

diff --git a/ns_extract/pipelines/contrasts/__init__.py b/ns_extract/pipelines/contrasts/__init__.py
@@ -0,0 +1,5 @@
+from .model import CoordinatesExtractor
+
+__all__ = [  # names exported by `from ns_extract.pipelines.contrasts import *`
+    "CoordinatesExtractor",
+]
diff --git a/ns_extract/pipelines/contrasts/model.py b/ns_extract/pipelines/contrasts/model.py
@@ -0,0 +1,13 @@
+"""Extract brain coordinates from contrasts in scientific papers."""
+
+from .schemas import ContrastBase
+from .prompts import base_message
+from ns_extract.pipelines.api import APIPromptExtractor
+
+
+class CoordinatesExtractor(APIPromptExtractor):
+    """Contrast and brain-coordinate extraction pipeline using LLM prompts."""
+
+    _version = "1.1.0"  # NOTE(review): new pipeline file, yet not 1.0.0 — confirm
+    _prompt = base_message  # prompt template; contains the ${text} placeholder
+    _output_schema = ContrastBase  # pydantic model the extracted output must match
diff --git a/ns_extract/pipelines/contrasts/prompts.py b/ns_extract/pipelines/contrasts/prompts.py
@@ -0,0 +1,69 @@
+# Prompt template for the contrasts extraction pipeline.
+# `${text}` marks where the paper text goes (presumably substituted by the
+# extractor that consumes this template — confirm against the caller).
+base_message = """
+You will be provided with a text sample from a scientific journal.
+The sample is delimited with triple backticks.
+
+TASK OBJECTIVE:
+Extract detailed information about MRI contrasts performed in the study.
+The extracted information should be structured according to the provided schema.
+If any information specified in the schema is not mentioned in the text, 
+return `null` for that field.
+
+EXTRACTION GUIDELINES:
+
+1. CONTRAST IDENTIFICATION:
+   - Look for the section or table that includes MRI contrasts
+   - Note that the contrast can be denoted in various ways, such as "contrast", "comparison" or "significance test", etc.
+   - If no contrast is mentioned in the text, return `null` for the contrast field
+
+2. TEST STATISTIC:
+   - The test statistic can be denoted in various ways, such as "t-statistic", "z-score", "F-statistic", etc.
+   - Report the test statistic exactly as stated, without inferring or calculating
+
+
+3. SIGNIFICANCE LEVEL:
+    - Significance level can be reported as "significant", "*", "**" or as an exact p-value (e.g., "p < 0.05")
+
+    P-values:
+   - If the exact p-value is mentioned, report it in the appropriate field (not as binary significance)
+   - Report the significance level exactly as stated
+   - If no significance level is mentioned, return `null` for the significance field
+
+   Significance:
+    - If the contrast is significant, report it as True in the significance field
+    - If the contrast is not significant, report it as False in the significance field
+
+4. ATLAS / PARCELLATION:
+   - Report the atlas or parcellation used in the study
+   - Report the number of regions in the atlas
+   - Report all atlas-related information exactly as stated, without inferring anything
+
+5. REGION OF INTEREST (ROI):
+    - If a specific region of interest is mentioned, report it exactly as stated
+    - Note that the region of interest can be denoted in various ways, such as "ROI", "region", "area", etc.
+    - ROI can be a specific atlas label or a more general term
+    - ROI can be denoted as an abbreviation, but not necessarily
+
+6. COORDINATE SYSTEM:
+    - Report the coordinate system used (e.g., Talairach, MNI, Native)
+    - If no coordinate system is mentioned, return `null` for the coord_system field
+
+7. BRAIN COORDINATES:
+    - Extract brain coordinates in the format "x=34, y=-22, z=56".
+    - Report each coordinate in its designated field (x, y or z)
+    - Note that the coordinates can be mentioned in the main text or in a table
+    - If no coordinates are mentioned, return `null` for the x, y, and z fields
+
+IMPORTANT REMINDERS:
+- Extract information EXACTLY as stated in the text
+- Use technical/medical terms verbatim from the source
+- Do not infer or calculate missing values
+- Return `null` for any information not explicitly provided
+
+Text sample: ${text}
+
+Return the extracted information in a structured format matching the specified schema,
+ensuring each field contains only explicitly stated information from the text.
+"""
diff --git a/ns_extract/pipelines/contrasts/schemas.py b/ns_extract/pipelines/contrasts/schemas.py
@@ -0,0 +1,113 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+from ..data_structures import NORMALIZE_TEXT, EXPAND_ABBREVIATIONS
+
+
+class ContrastBase(BaseModel):
+    """Schema for a single MRI contrast extracted from a paper."""
+
+    comparison: str = Field(
+        description="Comparison format or direction, "
+        "such as 'A vs B', 'A > B', etc.",
+        examples=[
+            "Children vs Adults",
+            "Placebo > Treatment",
+            "Group 1 < Group 2B",
+            "Healthy - Patients",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    control_group: str = Field(
+        description="Name of the control group, if applicable. "
+        "Does not necessarily include the word 'control'.",
+        examples=[
+            "Healthy Controls",
+            "Controls",
+            "Neurotypical Controls",
+            "Placebo",
+            "No treatment",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    group: str = Field(
+        description="Any group compared against the control group. ",
+        examples=[
+            "Patients with Depression",
+            "Treated Subjects",
+            "After Treatment",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    # NOTE(review): "statistc" is a typo of "statistic"; kept because run_evaluation.py reads this key.
+    contrast_statistc: Optional[str] = Field(
+        description="Statistic used for this contrast, such as 't-statistic', 'z-score', etc.",
+        examples=["t-statistic", "z-score", "F-statistic", "correlation"],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    atlas: Optional[str] = Field(
+        description="Atlas used for this contrast, if mentioned.",
+        examples=["Harvard-Oxford", "AAL", "Schaefer"],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    atlas_n_regions: Optional[int] = Field(
+        description="Number of regions in the atlas, if mentioned.",
+        examples=[64, 100, 200, 400],
+        json_schema_extra={NORMALIZE_TEXT: True},  # NOTE(review): int field — confirm this flag is a no-op
+    )
+    roi: Optional[str] = Field(
+        description="Region of interest if mentioned.",
+        examples=[
+            "Left DLPFC",
+            "Temporal gyrus",
+            "Angular gyrus",
+            "Calcarine fissure",
+            "IPL",
+            "SomMot_9",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    coord_system: Optional[str] = Field(
+        description="Coordinate system.",
+        examples=[
+            "Talairach",
+            "MNI",
+            "Native",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    x: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the x-axis mentioned in a table or text.",
+        examples=[
+            34,
+            -22,
+        ],
+    )
+    y: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the y-axis mentioned in a table or text.",
+        examples=[
+            14,
+            -52,
+        ],
+    )
+    z: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the z-axis mentioned in a table or text.",
+        examples=[
+            34,
+            -22,
+        ],
+    )
+    significance: Optional[bool] = Field(
+        description="Is the contrast significant? The response is binary (True or False). "
+        "Usually denoted with asterisks, bold font, or p-value.",
+        examples=[True, False],
+        json_schema_extra={NORMALIZE_TEXT: True},  # NOTE(review): bool field — confirm this flag is a no-op
+    )
+    significance_level: Optional[str] = Field(
+        description="The p-value or alpha significance threshold. E.g., 'p < 0.05', 'alpha = 0.025', etc.",
+        examples=["p < 0.05", "p < 0.001", "p = 0.001", "*", "**", "***"],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )