diff --git a/evaluation/run_evaluation.py b/evaluation/run_evaluation.py
index bb83a0c..22737d8 100644
--- a/evaluation/run_evaluation.py
+++ b/evaluation/run_evaluation.py
@@ -82,6 +82,13 @@ def run_extraction_pipelines():
                 "env_variable": "OPENAI_API_KEY",
             },
         ),
+        (
+            "contrasts",
+            {
+                "extraction_model": "gpt-4o-mini-2024-07-18",
+                "env_variable": "OPENAI_API_KEY",
+            },
+        ),
     ]
 
     # Define paths
@@ -168,7 +175,47 @@ def load_extracted_results(output_path: Path, dataset_type: str) -> pd.DataFrame
                                     "diagnosis": group.get("diagnosis", ""),
                                 }
                                 all_records.append(record)
-                    else:
+
+                    elif dataset_type == "contrasts":
+                        # Contrasts extractor: one record per extracted contrast.
+                        # NOTE(review): ContrastBase is flat, so a single-contrast
+                        # result is accepted as well as a "MRIContrasts" list.
+                        contrasts = results.get("MRIContrasts")
+                        if contrasts is None and "comparison" in results:
+                            contrasts = [results]
+                        elif isinstance(contrasts, dict):
+                            contrasts = [contrasts]
+
+                        if contrasts:
+                            pmcid = identifiers.get("pmcid")
+                            if not pmcid:
+                                continue
+
+                            for contrast in contrasts:
+                                record = {
+                                    # IDs are digits after the prefix: lstrip is safe.
+                                    "pmcid": str(pmcid).lstrip("PMC"),
+                                    "comparison": contrast.get("comparison", ""),
+                                    "control_group": contrast.get("control_group", ""),
+                                    "group": contrast.get("group", ""),
+                                    "contrast_statistc": contrast.get(
+                                        "contrast_statistc", ""
+                                    ),
+                                    "atlas": contrast.get("atlas", ""),
+                                    "atlas_n_regions": contrast.get("atlas_n_regions"),
+                                    "roi": contrast.get("roi", ""),
+                                    "coord_system": contrast.get("coord_system", ""),
+                                    "x": contrast.get("x"),
+                                    "y": contrast.get("y"),
+                                    "z": contrast.get("z"),
+                                    "significance": contrast.get("significance"),
+                                    "significance_level": contrast.get(
+                                        "significance_level", ""
+                                    ),
+                                }
+                                all_records.append(record)
+
+                    elif dataset_type == "task":
                         # For task extractor, extract task information
                         if "fMRITasks" in results:
                             # Combine all tasks into a single record per study
diff --git a/ns_extract/pipelines/contrasts/__init__.py b/ns_extract/pipelines/contrasts/__init__.py
new file mode 100644
index 0000000..4e92421
--- /dev/null
+++ b/ns_extract/pipelines/contrasts/__init__.py
@@ -0,0 +1,5 @@
+from .model import CoordinatesExtractor
+
+__all__ = [
+    "CoordinatesExtractor",
+]
diff --git a/ns_extract/pipelines/contrasts/model.py b/ns_extract/pipelines/contrasts/model.py
new file mode 100644
index 0000000..1b4d282
--- /dev/null
+++ b/ns_extract/pipelines/contrasts/model.py
@@ -0,0 +1,15 @@
+"""Extract brain coordinates from contrasts in scientific papers."""
+
+from .schemas import ContrastBase
+from .prompts import base_message
+from ns_extract.pipelines.api import APIPromptExtractor
+
+
+class CoordinatesExtractor(APIPromptExtractor):
+    """Contrast and coordinate extraction pipeline using LLM prompts."""
+
+    _version = "1.1.0"
+    # Prompt template containing the ``${text}`` placeholder.
+    _prompt = base_message
+    # Schema that structures the extracted fields.
+    _output_schema = ContrastBase
diff --git a/ns_extract/pipelines/contrasts/prompts.py b/ns_extract/pipelines/contrasts/prompts.py
new file mode 100644
index 0000000..8c67445
--- /dev/null
+++ b/ns_extract/pipelines/contrasts/prompts.py
@@ -0,0 +1,66 @@
+# Prompt for the contrasts pipeline; "${text}" is the placeholder for the text sample.
+base_message = """
+You will be provided with a text sample from a scientific journal.
+The sample is delimited with triple backticks.
+
+TASK OBJECTIVE:
+Extract detailed information about MRI contrasts performed in the study.
+The extracted information should be structured according to the provided schema.
+If any information specified in the schema is not mentioned in the text,
+return `null` for that field.
+
+EXTRACTION GUIDELINES:
+
+1. CONTRAST IDENTIFICATION:
+   - Look for the section or table that includes MRI contrasts
+   - Note that the contrast can be denoted in various ways, such as "contrast", "comparison" or "significance test", etc.
+   - If no contrast is mentioned in the text, return `null` for the comparison field
+
+2. TEST STATISTIC:
+   - The test statistic can be denoted in various ways, such as "t-statistic", "z-score", "F-statistic", etc.
+   - Report the test statistic exactly as stated, without inferring or calculating
+
+3. SIGNIFICANCE LEVEL:
+   - Significance level can be reported as "significant", "*", "**" or as an exact p-value (e.g., "p < 0.05")
+
+   P-values:
+   - If the exact p-value is mentioned, report it in the appropriate field (not as binary significance)
+   - Report the significance level exactly as stated
+   - If no significance level is mentioned, return `null` for the significance_level field
+
+   Significance:
+   - If the contrast is significant, report it as True in the significance field
+   - If the contrast is not significant, report it as False in the significance field
+
+4. ATLAS / PARCELLATION:
+   - Report the atlas or parcellation used in the study
+   - Report the number of regions in the atlas
+   - Report all atlas-related information exactly as stated, without inferring anything
+
+5. REGION OF INTEREST (ROI):
+   - If a specific region of interest is mentioned, report it exactly as stated
+   - Note that the region of interest can be denoted in various ways, such as "ROI", "region", "area", etc.
+   - ROI can be a specific atlas label or a more general term
+   - ROI can be denoted as an abbreviation, but not necessarily
+
+6. COORDINATE SYSTEM:
+   - Report the coordinate system used (e.g., Talairach, MNI, Native)
+   - If no coordinate system is mentioned, return `null` for the coord_system field
+
+7. BRAIN COORDINATES:
+   - Extract brain coordinates in the format "x=34, y=-22, z=56".
+   - Report each coordinate in its designated field (x, y or z)
+   - Note that the coordinates can be mentioned in the main text or in a table
+   - If no coordinates are mentioned, return `null` for the x, y, and z fields
+
+IMPORTANT REMINDERS:
+- Extract information EXACTLY as stated in the text
+- Use technical/medical terms verbatim from the source
+- Do not infer or calculate missing values
+- Return `null` for any information not explicitly provided
+
+Text sample: ${text}
+
+Return the extracted information in a structured format matching the specified schema,
+ensuring each field contains only explicitly stated information from the text.
+"""
diff --git a/ns_extract/pipelines/contrasts/schemas.py b/ns_extract/pipelines/contrasts/schemas.py
new file mode 100644
index 0000000..1dcd74f
--- /dev/null
+++ b/ns_extract/pipelines/contrasts/schemas.py
@@ -0,0 +1,112 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+from ..data_structures import NORMALIZE_TEXT, EXPAND_ABBREVIATIONS
+
+
+class ContrastBase(BaseModel):
+    """Fields extracted for one MRI contrast reported in a paper.
+
+    Covers the compared groups, the test statistic, the atlas and region of
+    interest, the x/y/z coordinate with its coordinate system, and significance.
+    """
+
+    comparison: str = Field(
+        description="Comparison format or direction, "
+        "such as 'A vs B', 'A > B', etc.",
+        examples=[
+            "Children vs Adults",
+            "Placebo > Treatment",
+            "Group 1 < Group 2B",
+            "Healthy - Patients",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    control_group: str = Field(
+        description="Name of the control group, if applicable. "
+        "Does not necessarily include the word 'control'.",
+        examples=[
+            "Healthy Controls",
+            "Controls",
+            "Neurotypical Controls",
+            "Placebo",
+            "No treatment",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    group: str = Field(
+        description="Any group compared against the control group. ",
+        examples=[
+            "Patients with Depression",
+            "Treated Subjects",
+            "After Treatment",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    # NOTE(review): "statistc" is a misspelling of "statistic"; the name is kept
+    # because the evaluation loader reads this exact key.
+    contrast_statistc: Optional[str] = Field(
+        description="Statistic used for this contrast, such as 't-statistic', 'z-score', etc.",
+        examples=["t-statistic", "z-score", "F-statistic", "correlation"],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    atlas: Optional[str] = Field(
+        description="Atlas used for this contrast, if mentioned.",
+        examples=["Harvard-Oxford", "AAL", "Schaefer"],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    # NOTE(review): NORMALIZE_TEXT on an integer field (x/y/z omit it) - confirm.
+    atlas_n_regions: Optional[int] = Field(
+        description="Number of regions in the atlas, if mentioned.",
+        examples=[64, 100, 200, 400],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    roi: Optional[str] = Field(
+        description="Region of interest if mentioned.",
+        examples=[
+            "Left DLPFC",
+            "Temporal gyrus",
+            "Angular gyrus",
+            "Calcarine fissure",
+            "IPL",
+            "SomMot_9",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
+    )
+    coord_system: Optional[str] = Field(
+        description="Coordinate system.",
+        examples=[
+            "Talairach",
+            "MNI",
+            "Native",
+        ],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    # Examples are plain integers so they are valid values for the int fields.
+    x: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the x-axis mentioned in a table or text.",
+        examples=[34, -22],
+    )
+    y: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the y-axis mentioned in a table or text.",
+        examples=[14, -52],
+    )
+    z: Optional[int] = Field(
+        default=None,
+        description="Brain coordinate on the z-axis mentioned in a table or text.",
+        examples=[34, -22],
+    )
+    significance: Optional[bool] = Field(
+        description="Is the contrast significant? The response is binary (True or False). "
+        "Usually denoted with asterisks, bold font, or p-value.",
+        examples=[True, False],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )
+    # Free text such as "p < 0.05", hence str rather than bool.
+    significance_level: Optional[str] = Field(
+        description="The p-value or alpha significance threshold. E.g., 'p < 0.05', 'alpha = 0.025', etc.",
+        examples=["p < 0.05", "p < 0.001", "p = 0.001", "*", "**", "***"],
+        json_schema_extra={NORMALIZE_TEXT: True},
+    )