Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,34 @@
# med-lm-data
Processing post-training datasets for medicine
# Quick Start
use:
- uv sync
- uv run -m src.main


### What it does
- Downloads the HF dataset `RJT1990/GeneralThoughtArchive` (full `train` split).
- Loads the NER model `Ihor/gliner-biomed-large-v1.0`.
- Scans each question (first 150 characters), detects biomedical entities across predefined labels.
- Classifies a question as biomedical if:
- it contains the token "bio" (case-insensitive), or
- the model predicts at least one entity above the threshold (default 0.80).
- Writes matched records to `Final_Bio_terms.txt`.
- Reconstructs the matching rows from the original HF dataset and saves them to `hf_med_filtered_data/` via `datasets.save_to_disk`.


## How it works

- Entry point: `src/main.py`
- Seeds and picks device (`cuda` if available).
- Iterates over `dataset['question']`, truncates to 150 chars, runs `model.predict_entities(...)`.
- Keeps only `{"label", "score"}` per entity for logging.
- Aggregates matched questions into `bio_term` and writes one JSON-like dict per line to `Final_Bio_terms.txt`.
- Calls `filtered_med_data(...)` to map truncated questions back to full HF rows and saves to `hf_med_filtered_data/`.

- NER model: `src/ner_model.py`
- Loads `Ihor/gliner-biomed-large-v1.0` to CPU/GPU and exposes `model` and `ner_labels`.

- Dataset: `src/hf_datasets.py`
- Loads split `train` from `RJT1990/GeneralThoughtArchive`

- Filtered HF data: `src/filtered_data.py`
- Matches records by truncated question text to reconstruct the original rows.
14 changes: 14 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[project]
name = "medllm-data"
version = "0.1.0"
description = "Processing post-training datasets for medicine"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"datasets>=4.1.1",
"gliner>=0.2.22",
"numpy>=2.3.3",
"pandas>=2.3.3",
"torch>=2.8.0",
"tqdm>=4.67.1",
]
Empty file added src/__init__.py
Empty file.
Binary file added src/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/filtered_data.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/hf_datasets.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/main.cpython-311.pyc
Binary file not shown.
Binary file added src/__pycache__/ner_model.cpython-311.pyc
Binary file not shown.
16 changes: 16 additions & 0 deletions src/filtered_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import ast
from datasets import Dataset

def filtered_med_data(file_path, dataset):
    """Reconstruct the full dataset rows whose question matches a saved record.

    Each non-blank line of ``file_path`` is a Python dict literal (as written
    by main.py) whose "question" key holds the first 150 characters of the
    original question.  Rows of ``dataset`` are matched on that same
    150-character prefix.

    Args:
        file_path: Path to the matched-records file, one dict repr per line.
        dataset: Iterable of row dicts (an HF Dataset) with a "question" field.

    Returns:
        A ``datasets.Dataset`` built from the matching rows, in dataset order,
        with each row included at most once.
    """
    # Collect the truncated questions once so matching is O(lines + rows)
    # instead of the original O(lines * rows) rescan of the dataset per line.
    # Using a set also stops a row being appended twice when the same
    # truncated question occurs on more than one file line.
    wanted = set()
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                # literal_eval only parses literals — it never executes code.
                record = ast.literal_eval(line)
                wanted.add(record.get("question", "")[:150])

    matches = [row for row in dataset if row["question"][:150] in wanted]
    return Dataset.from_list(matches)
3 changes: 3 additions & 0 deletions src/hf_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Loads the source HF dataset consumed by the filtering pipeline in main.py."""
from datasets import load_dataset
# Source corpus of reasoning questions to be filtered for biomedical content.
dataset_name = 'RJT1990/GeneralThoughtArchive'
# Full `train` split; downloads on first use, then served from the HF cache.
dataset = load_dataset(dataset_name, split='train')
76 changes: 76 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from src.ner_model import model, ner_labels
from src.hf_datasets import dataset
from src.filtered_data import filtered_med_data

import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
import numpy as np
import random
from gliner import GLiNER
import torch

# --- Reproducibility: seed every RNG the pipeline may touch. ---
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
random.seed(42)
np.random.seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on:", device)


def _log_bio_match(text, entities):
    # Single place for the "matched" console output (was duplicated verbatim
    # in both match branches of the loop).
    print("\n")
    print(f"Found a bio term: {text}")
    print(f"Entities: {entities}")
    print("\n")


# Track terms
non_bio_term = []  # questions with no "bio" token and no detected entities
bio_term = []      # questions classified as biomedical

for q in tqdm(dataset['question'], total=len(dataset)):
    text = q[:150]  # truncate early; the NER model only sees 150 chars
    entities = model.predict_entities(text, ner_labels, threshold=0.80)

    # Only keep label + score per entity for logging/output.
    filtered_entities = [
        {"label": ent["label"], "score": ent["score"]}
        for ent in entities
    ]

    record = {
        "question": text,
        "entities": filtered_entities
    }

    # A question counts as biomedical when it literally contains "bio"
    # (case-insensitive) OR the model detected at least one entity above
    # the threshold.  (Replaces the `entities == []` comparison with
    # idiomatic truthiness and removes the duplicated branch bodies.)
    if "bio" in text.lower() or entities:
        bio_term.append(record)
        _log_bio_match(text, filtered_entities)
    else:
        non_bio_term.append(record)


output_file_path_good = "Final_Bio_terms.txt"
with open(output_file_path_good, "w") as outfile:
    for item in bio_term:
        # One dict repr per line; filtered_data.py re-parses these with
        # ast.literal_eval, so this format is part of the contract.
        outfile.write(f"{item}\n")

print(f"Saved good terms to {output_file_path_good}, with {len(bio_term)} terms")


# Map the truncated matched questions back to their full HF rows and persist.
hf_med_filtered_data = filtered_med_data(output_file_path_good, dataset)

hf_med_filtered_data.save_to_disk("hf_med_filtered_data")

# Fix: report the target directory, not the Dataset object's repr (the
# original interpolated the object itself, making the message misleading).
print("+--------------- Saved filtered data to hf_med_filtered_data/ ---------------+")
36 changes: 36 additions & 0 deletions src/ner_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import torch
from gliner import GLiNER

# Run on the GPU when one is present; otherwise stay on the CPU.
use_gpu = torch.cuda.is_available()
device = "cuda" if use_gpu else "cpu"

# Biomedical GLiNER checkpoint used for zero-shot entity detection;
# .eval() switches the network into inference mode.
model = GLiNER.from_pretrained("Ihor/gliner-biomed-large-v1.0").to(device)
model.eval()
print(f"-------------- Loaded NER model to {device} --------------")

# Candidate entity labels handed to GLiNER for zero-shot biomedical NER.
# Order is preserved exactly as the model receives it.
ner_labels = [
    "Medicine",
    "Health",
    "Disease",
    "Pathology",
    "Pharmacology",
    "Surgery",
    "Nursing",
    "Ophthalmology",
    "Dermatology",
    "Radiology",
    "Immunology",
    "Epidemiology",
    "Neuroscience",
    "Diagnosis",
    "Treatment",
    "Disorder",
    "Medical Exams",
    "Genetics",
    "Medical",
    "Pediatric",
    "Forensic",
    "Parasitology",
    "Symptom",
    "Injury",
    "Organs",
]
Loading