Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,34 @@
# med-lm-data
Processing post-training datasets for medicine
# Quick Start
use:
- uv sync
- uv run -m src.main


### What it does
- Downloads the HF dataset `RJT1990/GeneralThoughtArchive` (full `train` split).
- Loads the NER model `Ihor/gliner-biomed-large-v1.0`.
- Scans each question (first 150 characters), detects biomedical entities across predefined labels.
- Classifies a question as biomedical if:
- it contains the token "bio" (case-insensitive), or
- the model predicts at least one entity above the threshold (default 0.80).
- Writes matched records to `Final_Bio_terms.txt`.
- Reconstructs the matching rows from the original HF dataset and saves them to `hf_med_filtered_data/` via `datasets.save_to_disk`.


## How it works

- Entry point: `src/main.py`
- Seeds and picks device (`cuda` if available).
- Iterates over `dataset['question']`, truncates to 150 chars, runs `model.predict_entities(...)`.
- Keeps only `{"label", "score"}` per entity for logging.
- Aggregates matched questions into `bio_term` and writes one JSON-like dict per line to `Final_Bio_terms.txt`.
- Calls `filtered_med_data(...)` to map truncated questions back to full HF rows and saves to `hf_med_filtered_data/`.

- NER model: `src/ner_model.py`
- Loads `Ihor/gliner-biomed-large-v1.0` to CPU/GPU and exposes `model` and `ner_labels`.

- Dataset: `src/hf_datasets.py`
- Loads split `train` from `RJT1990/GeneralThoughtArchive`

- Filtered HF data: `src/filtered_data.py`
- Matches records by truncated question text to reconstruct the original rows.
14 changes: 14 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[project]
name = "medllm-data"
version = "0.1.0"
description = "Processing post-training datasets for medicine"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"datasets>=4.1.1",
"gliner>=0.2.22",
"numpy>=2.3.3",
"pandas>=2.3.3",
"torch>=2.8.0",
"tqdm>=4.67.1",
]
Empty file added src/__init__.py
Empty file.
Binary file added src/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/filtered_data.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/hf_datasets.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/main.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/ner_model.cpython-311.pyc
Binary file not shown.
16 changes: 16 additions & 0 deletions src/filtered_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import ast
from datasets import Dataset

def filtered_med_data(file_path, dataset):
    """Reconstruct the full dataset rows whose question matches a saved record.

    Each non-blank line of ``file_path`` is a Python dict literal (as written
    by main.py) whose "question" key holds the first 150 characters of the
    original question.  Rows of ``dataset`` are matched on that same
    150-character prefix.

    Args:
        file_path: Path to the matched-records file, one dict repr per line.
        dataset: Iterable of row dicts (an HF Dataset) with a "question" field.

    Returns:
        A ``datasets.Dataset`` built from the matching rows, in dataset order,
        with each row included at most once.
    """
    # Collect the truncated questions once so matching is O(lines + rows)
    # instead of the original O(lines * rows) rescan of the dataset per line.
    # Using a set also stops a row being appended twice when the same
    # truncated question occurs on more than one file line.
    wanted = set()
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                # literal_eval only parses literals — it never executes code.
                record = ast.literal_eval(line)
                wanted.add(record.get("question", "")[:150])

    matches = [row for row in dataset if row["question"][:150] in wanted]
    return Dataset.from_list(matches)
3 changes: 3 additions & 0 deletions src/hf_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Loads the source HF dataset consumed by the filtering pipeline in main.py."""
from datasets import load_dataset
# Source corpus of reasoning questions to be filtered for biomedical content.
dataset_name = 'RJT1990/GeneralThoughtArchive'
# Full `train` split; downloads on first use, then served from the HF cache.
dataset = load_dataset(dataset_name, split='train')
76 changes: 76 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from src.ner_model import model, ner_labels
from src.hf_datasets import dataset
from src.filtered_data import filtered_med_data

import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
import numpy as np
import random
from gliner import GLiNER
import torch

# --- Reproducibility: seed every RNG the pipeline may touch. ---
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
random.seed(42)
np.random.seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on:", device)


def _log_bio_match(text, entities):
    # Single place for the "matched" console output (was duplicated verbatim
    # in both match branches of the loop).
    print("\n")
    print(f"Found a bio term: {text}")
    print(f"Entities: {entities}")
    print("\n")


# Track terms
non_bio_term = []  # questions with no "bio" token and no detected entities
bio_term = []      # questions classified as biomedical

for q in tqdm(dataset['question'], total=len(dataset)):
    text = q[:150]  # truncate early; the NER model only sees 150 chars
    entities = model.predict_entities(text, ner_labels, threshold=0.80)

    # Only keep label + score per entity for logging/output.
    filtered_entities = [
        {"label": ent["label"], "score": ent["score"]}
        for ent in entities
    ]

    record = {
        "question": text,
        "entities": filtered_entities
    }

    # A question counts as biomedical when it literally contains "bio"
    # (case-insensitive) OR the model detected at least one entity above
    # the threshold.  (Replaces the `entities == []` comparison with
    # idiomatic truthiness and removes the duplicated branch bodies.)
    if "bio" in text.lower() or entities:
        bio_term.append(record)
        _log_bio_match(text, filtered_entities)
    else:
        non_bio_term.append(record)


output_file_path_good = "Final_Bio_terms.txt"
with open(output_file_path_good, "w") as outfile:
    for item in bio_term:
        # One dict repr per line; filtered_data.py re-parses these with
        # ast.literal_eval, so this format is part of the contract.
        outfile.write(f"{item}\n")

print(f"Saved good terms to {output_file_path_good}, with {len(bio_term)} terms")


# Map the truncated matched questions back to their full HF rows and persist.
hf_med_filtered_data = filtered_med_data(output_file_path_good, dataset)

hf_med_filtered_data.save_to_disk("hf_med_filtered_data")

# Fix: report the target directory, not the Dataset object's repr (the
# original interpolated the object itself, making the message misleading).
print("+--------------- Saved filtered data to hf_med_filtered_data/ ---------------+")
36 changes: 36 additions & 0 deletions src/ner_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import torch
from gliner import GLiNER

# Run on the GPU when one is present; otherwise stay on the CPU.
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"

# Biomedical GLiNER checkpoint used for zero-shot entity detection;
# .eval() switches the network into inference mode.
model = GLiNER.from_pretrained("Ihor/gliner-biomed-large-v1.0").to(device)
model.eval()
print(f"-------------- Loaded NER model to {device} --------------")

# Candidate entity labels handed to GLiNER for zero-shot biomedical NER.
# Order is preserved exactly as the model receives it.
ner_labels = [
    "Medicine",
    "Health",
    "Disease",
    "Pathology",
    "Pharmacology",
    "Surgery",
    "Nursing",
    "Ophthalmology",
    "Dermatology",
    "Radiology",
    "Immunology",
    "Epidemiology",
    "Neuroscience",
    "Diagnosis",
    "Treatment",
    "Disorder",
    "Medical Exams",
    "Genetics",
    "Medical",
    "Pediatric",
    "Forensic",
    "Parasitology",
    "Symptom",
    "Injury",
    "Organs",
]
Loading