
Commit

initial commit
nchanko committed Mar 17, 2024
0 parents commit df63880
Showing 8 changed files with 307 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.env
__pycache__
100 changes: 100 additions & 0 deletions data_generation.py
@@ -0,0 +1,100 @@
from required_libaries import time, tqdm, csv
from gpt_processing import process_gpt
from file_management import save_patient, save_records

# Placeholder for any specific imports or functions that are relevant to data generation but not directly defined in the overview

# Function definitions related to generating patient data


def generate_patients(user_request: str = None, generate_time: int = 1, record_count: int = 10, output_file_name: str = "patient_data.csv"):
    """generate_time is the number of times the generation should run. Defaults to 1."""

if user_request is None:
user_request =""" Heart disease
Cancer
Chronic lower respiratory disease
Stroke
Alzheimer's disease
Diabetes
Influenza and pneumonia
Kidney disease
Suicide
Septicemia
Chronic liver disease and cirrhosis
Hypertension
Parkinson's disease
Intentional self-harm
Chronic obstructive pulmonary disease (COPD)
Liver cancer
Liver disease and cirrhosis
Falls
Leukemia
Breast cancer, Country of origin: USA"""

    for i in tqdm(range(generate_time)):
        # gen_recs: patient names and profiles generated by GPT.
        gen_recs = process_gpt(str(user_request), record_count, 1)
        print("Generation pass:", i + 1)
        print(gen_recs)

        # Save the generated patients
        save_patient(gen_recs, output_file_name)
        time.sleep(2)


# Read patient data
def read_patient_names():
with open('patient_data.csv', mode='r') as csv_file:
csv_reader = csv.reader(csv_file)
patients = []
        next(csv_reader)  # Skip header row (note: save_patient appends rows without a header, so drop this line if the file has none)
for row in csv_reader:
patient = " ".join(row[:7])
patients.append(patient)
return patients

def generate_records(start_index: int = 0, max_attempts: int = 10, record_count: int = 10, output_file_name: str = "medical_data3.csv"):
    """Generate patient medical records from the patient info and country of origin
    stored in patient_data.csv."""

attempts = 0

last_processed_index = start_index - 1

while attempts < max_attempts:
try:
#Read patient csv file.
patients = read_patient_names()[start_index:]
total_patients = len(patients)


for i, patient in tqdm(enumerate(patients, start=start_index), total=total_patients):
print(patient)
try:
                    gpt_generated = process_gpt(str(patient), record_count, 2)
                    # Save the patient note
                    save_records(gpt_generated, output_file_name)
                    last_processed_index = i
                    print(f"Patient {i+1} processed")
time.sleep(2)
except IndexError:
print(f"IndexError: Please check the CSV file and create_record() function for patient {i+1}.")
time.sleep(2)
raise # Re-raise the error to retry the current row
                except Exception as e:
                    time.sleep(2)
                    print(f"Error: {e} occurred for patient {i+1}. Moving on...")
                    continue  # Move on to the next patient after any other error

break # Exit loop if successful
except IndexError:
print("IndexError: Please check the CSV file and create_record() function.")
time.sleep(2)
attempts += 1
            if attempts == max_attempts:
                print(f"Failed after {attempts} attempts. Exiting. Restart from index {last_processed_index + 1}.")
else:
start_index = last_processed_index + 1
time.sleep(2)
print(f"Retrying from patient {start_index}...")
111 changes: 111 additions & 0 deletions file_management.py
@@ -0,0 +1,111 @@

from required_libaries import pd, time, os

# Function definitions related to saving and reading data
def check_csv(filename: str):
"""Check if the file exists"""
try:
with open(filename, 'r') as file:
return True
except FileNotFoundError:
return False

def save_records(gen_recs: str, outputfilename: str):
    """Parse a pipe-delimited GPT medical note and append it to the output CSV."""
#outputfilename ="medical_record2.csv"

rec_values = gen_recs.split('|')
medical_record = {
'record_id': rec_values[0].split(":")[1].strip(),
'date_of_visit': rec_values[1].split(":")[1].strip(),
'patient_id': rec_values[2].split(":")[1].strip(),
'patient_name': rec_values[3].split(":")[1].strip(),
'age': (rec_values[4]).split(":")[1].strip(),
'gender': rec_values[5].split(":")[1].strip(),
'race': rec_values[6].split(":")[1].strip(),
'country_of_origin': rec_values[7].split(":")[1].strip(),
'chief_complaint': rec_values[8].split(":")[1].strip(),
'history of present illness': rec_values[9].split(":")[1].strip(),
'past medical and surgical history': rec_values[10].split(":")[1].strip(),
'immunization history': rec_values[11].split(":")[1].strip(),
'allergy history': rec_values[12].split(":")[1].strip(),
'currently taking drugs': rec_values[13].split(":")[1].strip(),
'social/personal history': rec_values[14].split(":")[1].strip(),
'investigation records': rec_values[15].split(":")[1].strip(),
'lab tests and results': rec_values[16].split(":")[1].strip(),
'differential diagnosis': rec_values[17].split(":")[1].strip(),
'treatment and management plan': rec_values[18].split(":")[1].strip(),
'additional notes': rec_values[19].split(":")[1].strip()
}

# create a DataFrame from the medical record dictionary
    df = pd.DataFrame([medical_record])

# reorder the columns to match the desired order
df = df[['record_id', 'date_of_visit', 'patient_id', 'patient_name', 'age', 'gender', 'race', 'country_of_origin', 'chief_complaint',
'history of present illness', 'past medical and surgical history', 'immunization history', 'allergy history', 'currently taking drugs',
'social/personal history', 'investigation records', 'lab tests and results', 'differential diagnosis', 'treatment and management plan',
'additional notes']]

# export the DataFrame to a CSV file
df.to_csv(outputfilename, mode='a', header=False,index=False)
print("Record saved successfully")

def save_patient(patient_names: str, outfile_name: str):
    """Parse GPT-generated patient lines and append them to the output CSV (outfile_name)."""
# Load the existing patient data
#outfile_name = "patient_data.csv"
if os.path.exists(outfile_name):
# Load the existing patient data
df = pd.read_csv(outfile_name,header = None)
if df.empty:
max_id = 0
else:
# Get the maximum ID value in the existing data
max_id = df[0].str.replace('P', '').astype(int).max()
else:
# File doesn't exist, start max index from zero
max_id = 0
start_id = max_id+1

records = patient_names.split('\n')

# Create an empty list to store the patient data
patient_data = []

    # Loop through each record and extract the patient information
for record in records:
# Skip empty records
if not record:
continue
# Split the record into individual data fields
try:
record_data = record.split('|')
# Extract the patient information from the record
            patient_id = 'P' + str(start_id).zfill(6)  # avoid shadowing the built-in id()
            name = record_data[0].split('. ')[1].strip()
            age = record_data[1].strip()
            gender = record_data[2].strip()
            race = record_data[3].strip()
            country = record_data[4].strip()
            chief_complaint = record_data[5].strip()
            disease = record_data[6].strip()
            # Add the patient data to the list
            patient_data.append([patient_id, name, age, gender, race, country, chief_complaint, disease])
            # Increment the running patient ID
            start_id += 1
        except Exception as e:
            print("Error:", e)
            print("Skipping malformed record after 1 second...")
            time.sleep(1)
            # Keep start_id just after the most recently saved patient ID, then move on to the next record
            start_id = int(patient_data[-1][0].strip('P')) + 1 if patient_data else start_id
            continue

# Convert the list of patient data to a DataFrame
df = pd.DataFrame(patient_data, columns=['Patient ID', 'Name', 'Age', 'Gender', 'Race', 'Country of Origin', 'Chief Complaint', 'Disease'])
# Append the data to the existing file
print("pass")
df.to_csv(outfile_name, mode='a', header=False, index=False)
print(f"Successfully appended to file {outfile_name}")
51 changes: 51 additions & 0 deletions gpt_processing.py
@@ -0,0 +1,51 @@

from required_libaries import client

# Placeholder for any specific imports or functions that are relevant to GPT processing but not directly defined in the overview

# Function definitions related to processing or generating text with GPT models

def process_gpt(user_request: str, record_count: int = 10, prompt_num: int = None):
    """user_request is the prompt from the user.
    prompt_num selects the system prompt:
    1 to generate patient names and profiles.
    2 to generate a patient medical record."""


if prompt_num == 1:
system_prompt = str(f"""Generate {record_count} dummy patient records.
I will provide the list of differential diseases and country of origin.
You will answer me with full names, ages, genders, race, country of origin, chief complaint (reason to visit the doctor) and disease.
Your chief complaint must be complex with overlapping symptoms and should be relevant to the country of origin. Separate each line by \n and separate each cell by |.
Do not write extra sentences. Do not index. Example format: 1. Sarah Fish |23|Female| White| USA| Fever, headache, joint pains, and vomiting|disease. My first request is
.""")
    elif prompt_num == 2:
        system_prompt = str(f"""Generate {record_count} medical notes for the given patient. I will provide you the patient_id, name, age, gender, race, country of origin and chief complaint. You will answer me with these columns:
record_id (format : ddmmpatient_id)| date_of_visit(dd/mm/yyyy) |patient_id |patient_name |age|gender|race|country_of_origin| chief_complaint| history_of_present_illness| past_medical_and_surgical_history|immunization_history |allergy_history| currently_taking_drugs |social_personal_history |investigation_records| lab_tests_and_results| differential_diagnosis| treatment_and_management_plan| additional_notes. Each column must be separated by "|".
Each entry must read like a natural medical note; use + or - for present and absent findings. currently_taking_drugs must include name, dosage, form, frequency and duration.
Do not write extra sentences.
If more information needs to be added, put it in additional_notes. If there is nothing to generate, write Null.
The medical note must be as complete as possible and must be relevant to chief complaint. History must contain all relevant additional information related to patient chief complaint.
Example format is Example format: 'record_id: date_of_visit+patient_id'|'date_of_visit: dd/mm/yyyy'| 'patient_id: P000003'| 'patient_name: Aung Soe'| 'age: 55'|'gender: Male'| 'race: Asian'| 'country_of_origin: Myanmar'|'chief_complaint: Extreme thirst, frequent urination, numbness or tingling of feet.'|'history_of_present_illness: explain here'|'past_medical_and_surgical_history: explain here'| 'immunization_history: explain here'| 'allergy_history: explain here'| 'currently_taking_drugs: '| 'social_personal_history: explain here'| 'investigation_records: explain here'| 'lab_tests_and_results: explain here', 'differential_diagnosis: explain here' , 'treatment_and_management_plan: explain here', 'additional_notes: explain here.'
My first request is
""")
    elif prompt_num == 3:
        system_prompt = ""

    else:
        raise ValueError("No prompt found: prompt_num must be 1, 2 or 3")

# Generate patient information using OpenAI API
messages = [{"role": "system", "content":system_prompt },
{"role": "user", "content": user_request}]


model = "gpt-3.5-turbo"
response = client.chat.completions.create(model=model, messages=messages, max_tokens=2000)
generations = response.choices[0].message.content

return generations
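A minimal usage sketch of process_gpt (assuming required_libaries exposes a configured OpenAI client and the API key is loaded from .env; the request text below is only an example):

# Hypothetical call: generate 3 dummy patient profiles for two diseases.
names = process_gpt("Dengue fever\nTyphoid, Country of origin: Myanmar", record_count=3, prompt_num=1)
print(names)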



12 changes: 12 additions & 0 deletions main.py
@@ -0,0 +1,12 @@
from data_generation import generate_records, generate_patients
# Placeholder for any specific imports or code snippets that are relevant to orchestrating the workflow but not directly defined in the overview

# Main function definition

def main():
    # Create patient names and profiles
    generate_patients(generate_time=1, record_count=5, output_file_name="patient_data.csv")
    # Create the patients' medical records
    generate_records(start_index=0, record_count=5, output_file_name="medical_data.csv")


if __name__ == "__main__":
    main()
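The shared imports come from required_libaries.py, which is among the 8 changed files but not rendered above. A hypothetical sketch of what it would need to expose for these modules to run (the client setup is an assumption, not the committed file):

# required_libaries.py -- hypothetical sketch, not the committed file.
# The modules above import pd, time, os, tqdm, csv and client from here.
import csv
import os
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

load_dotenv()       # reads OPENAI_API_KEY from the .env file listed in .gitignore
client = OpenAI()   # assumes OPENAI_API_KEY is set in the environment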
