Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion evaluation/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ def run_extraction_pipelines():
"env_variable": "OPENAI_API_KEY",
},
),
(
"contrasts",
{
"extraction_model": "gpt-4o-mini-2024-07-18",
"env_variable": "OPENAI_API_KEY",
},
),
]

# Define paths
Expand Down Expand Up @@ -168,7 +175,39 @@ def load_extracted_results(output_path: Path, dataset_type: str) -> pd.DataFrame
"diagnosis": group.get("diagnosis", ""),
}
all_records.append(record)
else:

elif dataset_type == "contrasts":
# For the contrasts extractor, extract contrast information
if "MRIContrasts" in results:
# Combine all contrasts into a single record per study
contrasts = results.get("MRIContrasts", [])

if not identifiers.get("pmcid"):
continue

record = {
"pmcid": str(
identifiers.get("pmcid").lstrip("PMC")
),
"comparison": contrasts.get("comparison", ""),
"control_group": group.get("control_group", ""),
"group": group.get("group", ""),
"contrast_statistc": group.get("contrast_statistc", ""),
"atlas": group.get("atlas", ""),
"atlas_n_regions": group.get("atlas_n_regions"),
"roi": group.get("roi", ""),
"coord_system": group.get("coord_system", ""),
"x": group.get("x"),
"y": group.get("y"),
"z": group.get("z"),
"significance": group.get("significance"),
"significance_level": group.get("significance_level", ""),

}

all_records.append(record)

elif dataset_type == "task":
# For task extractor, extract task information
if "fMRITasks" in results:
# Combine all tasks into a single record per study
Expand Down
5 changes: 5 additions & 0 deletions ns_extract/pipelines/contrasts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .model import CoordinatesExtractor

# Public API of the contrasts pipeline package: only the extractor class
# imported above is re-exported.
__all__ = ["CoordinatesExtractor"]
13 changes: 13 additions & 0 deletions ns_extract/pipelines/contrasts/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Extract brain coordinates from contrasts in scientific papers."""

from .schemas import ContrastBase
from .prompts import base_message
from ns_extract.pipelines.api import APIPromptExtractor


class CoordinatesExtractor(APIPromptExtractor):
    """Contrast and brain-coordinate extraction pipeline using LLM prompts.

    Configures the base ``APIPromptExtractor`` with the contrasts prompt
    (``base_message``) and the ``ContrastBase`` output schema.
    """

    # NOTE(review): "1.1.0" for a newly added pipeline looks carried over
    # from another extractor -- confirm the intended starting version.
    _version = "1.1.0"
    # Prompt template sent to the model for each document.
    _prompt = base_message
    # Pydantic model the model's response is expected to match.
    _output_schema = ContrastBase
76 changes: 76 additions & 0 deletions ns_extract/pipelines/contrasts/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Prompt template for the contrasts extraction pipeline.
#
# `${text}` is the single placeholder in the template; presumably it is
# substituted with the article text by the extractor (confirm against
# APIPromptExtractor). Keep exactly one occurrence of it.
#
# The numbered guidelines map onto the fields of the output schema
# (comparison, contrast statistic, significance, atlas, ROI, coordinate
# system, x/y/z). There is intentionally no DOI section: the schema has no
# field to hold a DOI.
base_message = """
You will be provided with a text sample from a scientific journal.
The sample is delimited with triple backticks.

TASK OBJECTIVE:
Extract detailed information about MRI contrasts performed in the study.
The extracted information should be structured according to the provided schema.
If any information specified in the schema is not mentioned in the text,
return `null` for that field.

EXTRACTION GUIDELINES:

1. CONTRAST IDENTIFICATION:
- Look for the section or table that includes MRI contrasts
- Note that the contrast can be denoted in various ways, such as "contrast", "comparison" or "significance test", etc.
- If no contrast is mentioned in the text, return `null` for the comparison field

2. TEST STATISTIC:
- The test statistic can be denoted in various ways, such as "t-statistic", "z-score", "F-statistic", etc.
- Report the test statistic exactly as stated, without inferring or calculating

3. SIGNIFICANCE LEVEL:
- Significance level can be reported as "significant", "*", "**" or as an exact p-value (e.g., "p < 0.05")

P-values:
- If the exact p-value is mentioned, report it in the appropriate field (not as binary significance)
- Report the significance level exactly as stated
- If no significance level is mentioned, return `null` for the significance field

Significance:
- If the contrast is significant, report it as True in the significance field
- If the contrast is not significant, report it as False in the significance field

4. ATLAS / PARCELLATION:
- Report the atlas or parcellation used in the study
- Report the number of regions in the atlas
- Report all atlas-related information exactly as stated, without inferring anything

5. REGION OF INTEREST (ROI):
- If a specific region of interest is mentioned, report it exactly as stated
- Note that the region of interest can be denoted in various ways, such as "ROI", "region", "area", etc.
- ROI can be a specific atlas label or a more general term
- ROI can be denoted as an abbreviation, but not necessarily

6. COORDINATE SYSTEM:
- Report the coordinate system used (e.g., Talairach, MNI, Native)
- If no coordinate system is mentioned, return `null` for the coord_system field

7. BRAIN COORDINATES:
- Brain coordinates are usually reported as triplets, e.g., "x=34, y=-22, z=56" or "(34, -22, 56)"
- Report each coordinate as a number in its designated field (x, y or z)
- Note that the coordinates can be mentioned in the main text or in a table
- If no coordinates are mentioned, return `null` for the x, y, and z fields

IMPORTANT REMINDERS:
- Extract information EXACTLY as stated in the text
- Use technical/medical terms verbatim from the source
- Do not infer or calculate missing values
- Return `null` for any information not explicitly provided

Text sample: ${text}

Return the extracted information in a structured format matching the specified schema,
ensuring each field contains only explicitly stated information from the text.
"""
113 changes: 113 additions & 0 deletions ns_extract/pipelines/contrasts/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from typing import List, Optional
from pydantic import BaseModel, Field

from ..data_structures import NORMALIZE_TEXT, EXPAND_ABBREVIATIONS


class ContrastBase(BaseModel):
    """A single MRI contrast reported in a study, with its statistic, location and significance."""

    # NOTE(review): the evaluation script looks results up under an
    # "MRIContrasts" key, but this model describes one contrast and is used
    # directly as the pipeline's output schema -- confirm whether a list
    # container model is intended.

    comparison: str = Field(
        description="Comparison format or direction, "
        "such as 'A vs B', 'A > B', etc.",
        examples=[
            "Children vs Adults",
            "Placebo > Treatment",
            "Group 1 < Group 2B",
            "Healthy - Patients",
        ],
        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
    )
    # Nullable because the field is documented as "if applicable" and the
    # prompt asks for `null` when information is absent; it has no default,
    # so it is still required to be present in the response.
    control_group: Optional[str] = Field(
        description="Name of the control group, if applicable. "
        "Does not necessarily include the word 'control'.",
        examples=[
            "Healthy Controls",
            "Controls",
            "Neurotypical Controls",
            "Placebo",
            "No treatment",
        ],
        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
    )
    group: str = Field(
        description="Any group compared against the control group.",
        examples=[
            "Patients with Depression",
            "Treated Subjects",
            "After Treatment",
        ],
        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
    )
    # NOTE: the field name is misspelled ("statistc") but is kept as-is
    # because the evaluation code reads records by this exact key.
    contrast_statistc: Optional[str] = Field(
        description="Statistic used for this contrast, such as 't-statistic', 'z-score', etc.",
        examples=["t-statistic", "z-score", "F-statistic", "correlation"],
        json_schema_extra={NORMALIZE_TEXT: True},
    )
    atlas: Optional[str] = Field(
        description="Atlas used for this contrast, if mentioned.",
        examples=["Harvard-Oxford", "AAL", "Schaefer"],
        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
    )
    # NOTE(review): NORMALIZE_TEXT is set on a non-text (int) field --
    # confirm the normalizer ignores non-string values.
    atlas_n_regions: Optional[int] = Field(
        description="Number of regions in the atlas, if mentioned.",
        examples=[64, 100, 200, 400],
        json_schema_extra={NORMALIZE_TEXT: True},
    )
    roi: Optional[str] = Field(
        description="Region of interest if mentioned.",
        examples=[
            "Left DLPFC",
            "Temporal gyrus",
            "Angular gyrus",
            "Calcarine fissure",
            "IPL",
            "SomMot_9",
        ],
        json_schema_extra={NORMALIZE_TEXT: True, EXPAND_ABBREVIATIONS: True},
    )
    coord_system: Optional[str] = Field(
        description="Coordinate system.",
        examples=[
            "Talairach",
            "MNI",
            "Native",
        ],
        json_schema_extra={NORMALIZE_TEXT: True},
    )
    # Coordinates are integer fields, so the examples are plain integers
    # rather than "x=34"-style strings.
    # NOTE(review): papers sometimes report fractional coordinates --
    # confirm whether these should be floats.
    x: Optional[int] = Field(
        default=None,
        description="Brain coordinate on the x-axis mentioned in a table or text.",
        examples=[34, -22],
    )
    y: Optional[int] = Field(
        default=None,
        description="Brain coordinate on the y-axis mentioned in a table or text.",
        examples=[14, -52],
    )
    z: Optional[int] = Field(
        default=None,
        description="Brain coordinate on the z-axis mentioned in a table or text.",
        examples=[34, -22],
    )
    # Boolean flag, so the examples are booleans; the textual markers
    # (asterisks, p-values) belong to `significance_level` below.
    # NOTE(review): NORMALIZE_TEXT is set on a non-text (bool) field --
    # confirm the normalizer ignores non-string values.
    significance: Optional[bool] = Field(
        description="Is the contrast significant? The response is binary (True or False). "
        "Usually denoted with asterisks, bold font, or p-value.",
        examples=[True, False],
        json_schema_extra={NORMALIZE_TEXT: True},
    )
    # Textual value such as "p < 0.05"; typed as str to match its
    # description and examples (a bool cannot hold a p-value).
    significance_level: Optional[str] = Field(
        description="The p-value or alpha significance threshold. E.g., 'p < 0.05', 'alpha = 0.025', etc.",
        examples=["p < 0.05", "p < 0.001", "p = 0.001", "*", "**", "***"],
        json_schema_extra={NORMALIZE_TEXT: True},
    )