-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit df63880
Showing
8 changed files
with
307 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.env | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
from required_libaries import time, tqdm,csv | ||
from gpt_processing import process_gpt | ||
from file_management import save_patient,save_records | ||
|
||
# Placeholder for any specific imports or functions that are relevant to data generation but not directly defined in the overview | ||
|
||
# Function definitions related to generating patient data | ||
|
||
|
||
def generate_patients(user_request: str = None, generate_time: int = 1,
                      record_count: int = 10,
                      output_file_name: str = "patient_data.csv"):
    """Request batches of dummy patients from GPT and save every batch.

    Args:
        user_request: Text sent to GPT as the user message. When None, a
            built-in list of diseases ending with a country of origin is used.
        generate_time: How many batches to request (default 1).
        record_count: Number of patients requested per batch.
        output_file_name: CSV file name passed to save_patient for each batch.
    """
    if user_request is None:
        user_request = """ Heart disease
Cancer
Chronic lower respiratory disease
Stroke
Alzheimer's disease
Diabetes
Influenza and pneumonia
Kidney disease
Suicide
Septicemia
Chronic liver disease and cirrhosis
Hypertension
Parkinson's disease
Intentional self-harm
Chronic obstructive pulmonary disease (COPD)
Liver cancer
Liver disease and cirrhosis
Falls
Leukemia
Breast cancer, Country of origion :USA"""

    request_text = str(user_request)

    for batch_number in tqdm(range(1, generate_time + 1)):
        # Prompt option 1 asks GPT for patient names and profiles.
        batch_text = process_gpt(request_text, record_count, 1)
        print("Generated time :", batch_number)
        print(batch_text)

        # Persist this batch, then pause two seconds before the next request
        # (presumably to stay under the API rate limit -- confirm).
        save_patient(batch_text, output_file_name)
        time.sleep(2)
|
||
|
||
##Read patient data | ||
def read_patient_names(filename: str = "patient_data.csv"):
    """Return one space-joined summary string per patient row of a CSV file.

    Each summary joins the first seven columns of a row with single spaces;
    any further columns are left out.

    The first row is skipped only when it is a header, i.e. when its first
    cell is not a patient ID of the form "P" followed by digits. A file that
    was written without a header row therefore keeps its first patient
    instead of silently losing it.

    Args:
        filename: Path of the patient CSV file. Defaults to
            "patient_data.csv", the name that used to be hard-coded.

    Returns:
        A list of strings, one per data row (empty list for an empty file).

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(filename, mode='r', newline='') as csv_file:
        rows = list(csv.reader(csv_file))

    if rows:
        first_cell = rows[0][0].strip() if rows[0] else ""
        first_row_is_patient = first_cell[:1] == "P" and first_cell[1:].isdigit()
        if not first_row_is_patient:
            rows = rows[1:]  # drop the header row

    return [" ".join(row[:7]) for row in rows]
|
||
def generate_records(start_index: int = 0, max_attempts: int = 10,
                     record_count: int = 10,
                     output_file_name: str = "medical_data3.csv"):
    """Generate a medical record for every patient returned by read_patient_names.

    For each patient summary, GPT is asked (prompt option 2) for a medical
    note, which is appended to ``output_file_name`` through save_records.

    Error handling:
        * IndexError while processing a patient aborts the current pass; the
          whole pass is restarted from the first patient after the last one
          that was saved, up to ``max_attempts`` times.
        * Any other exception skips that patient and moves on to the next
          one. Skipped patients are reported once the run ends.

    Args:
        start_index: 0-based position of the first patient to process.
        max_attempts: Maximum number of passes started after IndexErrors.
        record_count: Passed through to process_gpt.
        output_file_name: CSV file that save_records appends to.
    """
    attempts = 0
    last_processed_index = start_index - 1
    skipped_indices = set()  # 0-based indices skipped after a non-IndexError

    while attempts < max_attempts:
        try:
            # Re-read the patient file on every pass, starting at the resume point.
            patients = read_patient_names()[start_index:]
            total_patients = len(patients)

            for i, patient in tqdm(enumerate(patients, start=start_index), total=total_patients):
                print(patient)
                try:
                    gpt_generated = process_gpt(str(patient), record_count, 2)
                    # Save patient note.
                    save_records(gpt_generated, output_file_name)
                    last_processed_index = i
                    skipped_indices.discard(i)
                    print(f"patient {i+1} passed")
                    time.sleep(2)
                except IndexError:
                    print(f"IndexError: Please check the CSV file and create_record() function for patient {i+1}.")
                    time.sleep(2)
                    raise  # Handled by the outer handler, which restarts the pass
                except Exception as e:
                    time.sleep(2)
                    # The patient is skipped, not retried: say so, and use the
                    # same 1-based numbering as the other messages.
                    skipped_indices.add(i)
                    print(f"Error: {e} occurred for patient {i+1}. Skipping...")
                    continue

            break  # Exit loop if the pass completed
        except IndexError:
            print("IndexError: Please check the CSV file and create_record() function.")
            time.sleep(2)
            attempts += 1
            resume_index = last_processed_index + 1
            if attempts == max_attempts:
                # Report the value to pass as start_index (0-based) rather than
                # a loop variable that is only bound once the loop has started.
                print(f"Failed after {attempts} attempts. Exiting program. Restart with start_index={resume_index}")
            else:
                start_index = resume_index
                time.sleep(2)
                print(f"Retrying from patient {start_index}...")

    if skipped_indices:
        print(f"Skipped patients (0-based start_index values): {sorted(skipped_indices)}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
|
||
from required_libaries import pd, time, os | ||
|
||
# Function definitions related to saving and reading data | ||
def check_csv(filename: str):
    """Return True when ``filename`` names an existing regular file, else False.

    Uses os.path.isfile instead of opening the file, so no file handle is
    created just to test existence, and a directory path yields False
    instead of raising.
    """
    return os.path.isfile(filename)
|
||
def save_records(gen_recs: str, outputfilename: str):
    """Parse one '|'-separated medical note and append it to a CSV file.

    ``gen_recs`` must contain at least 20 fields separated by "|", each of
    the form "label: value". Fields are mapped to columns by position, not
    by label. Only the text after the first ":" is kept, so values that
    themselves contain colons (times, "drug: dose") are stored in full.

    Args:
        gen_recs: The generated medical note.
        outputfilename: CSV file to append to (no header row is written).

    Raises:
        IndexError: If there are fewer than 20 fields or a field has no ":".
    """
    columns = (
        'record_id', 'date_of_visit', 'patient_id', 'patient_name', 'age',
        'gender', 'race', 'country_of_origin', 'chief_complaint',
        'history of present illness', 'past medical and surgical history',
        'immunization history', 'allergy history', 'currently taking drugs',
        'social/personal history', 'investigation records',
        'lab tests and results', 'differential diagnosis',
        'treatment and management plan', 'additional notes',
    )

    rec_values = gen_recs.split('|')
    medical_record = {}
    for position, column in enumerate(columns):
        # maxsplit=1 strips only the label; a missing ":" still raises
        # IndexError, like a missing field does.
        medical_record[column] = rec_values[position].split(":", 1)[1].strip()

    # One-row frame with the columns in their fixed order.
    df = pd.DataFrame([medical_record], columns=list(columns))

    # Append the row to the CSV file.
    df.to_csv(outputfilename, mode='a', header=False, index=False)
    print("Record saved successfully")
|
||
def save_patient(patient_names: str, outfile_name: str):
    """Parse GPT-generated patient lines and append them to a CSV file.

    Each non-blank line of ``patient_names`` is expected to look like
    "1. Name|age|gender|race|country|chief complaint|disease". A leading
    "<number>." index is optional and is removed from the name. Lines with
    fewer than seven "|"-separated fields are reported and skipped.

    Patient IDs have the form "P" plus a zero-padded six-digit number and
    continue from the highest ID already present in ``outfile_name``. They
    start at P000001 when the file is missing or empty.

    Args:
        patient_names: Newline-separated patient lines.
        outfile_name: CSV file to append to (no header row is written).
    """
    import re  # local import: the file-level import line is outside this block

    # Find the highest ID already stored, if any.
    max_id = 0
    if os.path.exists(outfile_name):
        try:
            existing = pd.read_csv(outfile_name, header=None)
        except pd.errors.EmptyDataError:
            # The file exists but has no rows; read_csv raises rather than
            # returning an empty frame.
            existing = None
        if existing is not None and not existing.empty:
            max_id = max(int(str(value).replace('P', '')) for value in existing[0])
    start_id = max_id + 1

    patient_data = []
    for record in patient_names.split('\n'):
        # Skip blank lines.
        if not record.strip():
            continue
        fields = record.split('|')
        try:
            # Remove only a leading "<digits>." index, so names containing
            # ". " (e.g. "Dr. Bob Ray") stay intact.
            name = re.sub(r"^\s*\d+\s*\.\s*", "", fields[0]).strip()
            age = fields[1].strip()
            gender = fields[2].strip()
            race = fields[3].strip()
            country = fields[4].strip()
            chief_complaint = fields[5].strip()
            disease = fields[6].strip()
        except IndexError as e:
            # Malformed line: report it and move on. No ID is used up,
            # because start_id only advances after a successful append.
            print("Error: ", e)
            print(f"Skipping malformed record: {record!r}")
            continue

        patient_id = 'P' + str(start_id).zfill(6)
        patient_data.append([patient_id, name, age, gender, race, country, chief_complaint, disease])
        start_id += 1

    # Convert the list of patient data to a DataFrame and append it to the file.
    df = pd.DataFrame(patient_data, columns=['Patient ID', 'Name', 'Age', 'Gender', 'Race', 'Country of Origin', 'Chief Complaint', 'Disease'])
    df.to_csv(outfile_name, mode='a', header=False, index=False)
    print(f"Successfully appended to file {outfile_name}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
|
||
from required_libaries import client | ||
|
||
# Placeholder for any specific imports or functions that are relevant to GPT processing but not directly defined in the overview | ||
|
||
# Function definitions related to processing or generating text with GPT models | ||
|
||
def process_gpt(user_request: str, record_count: int = 10, prompt_num: int = None):
    """Send ``user_request`` to the chat model with one of the system prompts.

    Args:
        user_request: Text sent as the user message.
        record_count: Number of records the system prompt asks for.
        prompt_num: Which system prompt to use:
            1 - generate patient names and profiles,
            2 - generate a patient medical record,
            3 - empty system prompt.

    Returns:
        The text content of the first choice in the model's response.

    Raises:
        ValueError: If ``prompt_num`` is not 1, 2 or 3. Without this check
            the call failed later with UnboundLocalError.
    """
    if prompt_num == 1:
        # "\\n" keeps the two characters backslash-n in the prompt; a bare \n
        # here would become a real line break and the instruction would read
        # "Separate each line by" followed by nothing.
        system_prompt = f"""Generate {record_count} dummy patient records.
I will provide the list of differential diseases and country of origin.
You will answer me with full names, ages, genders, race, country of origin , chief complaint(reason to visit the doctor) ,disease.
Your chief complaint must be complex with overlap symptoms and should be relevant to the country of origin. Separate each line by \\n and separate each cell by |.
Do not write extra sentences. Do not index.Example format: 1. Sarah Fish |23|Female| White| USA| Fever, headache, joint pains, and vomiting|disease. My first request is
."""
    elif prompt_num == 2:
        # The example separates all twenty fields with "|", matching the rule
        # stated in the prompt; its last four fields used to be comma-separated.
        system_prompt = f"""Generate {record_count} medical note for the given patient .I will provide you the patient_id,name,age,gender,race, country of origin and chief complaint. You will answer me these columns:
record_id (format : ddmmpatient_id)| date_of_visit(dd/mm/yyyy) |patient_id |patient_name |age|gender|race|country_of_origin| chief_complaint| history_of_present_illness| past_medical_and_surgical_history|immunization_history |allergy_history| currently_taking_drugs |social_personal_history |investigation_records| lab_tests_and_results| differential_diagnosis| treatment_and_management_plan| additional_notes. Each column must be separated by "|".
Each entry must be similar to a natural medical note can user + or - for present and absent. currently taking drugs must include name,dosage,form,frequency and duration.
Do not write extra sentences.
If more information is required to add , put in additional notes. If there is nothing to generate ,write Null.
The medical note must be as complete as possible and must be relevant to chief complaint. History must contain all relevant additional information related to patient chief complaint.
Example format is Example format: 'record_id: date_of_visit+patient_id'|'date_of_visit: dd/mm/yyyy'| 'patient_id: P000003'| 'patient_name: Aung Soe'| 'age: 55'|'gender: Male'| 'race: Asian'| 'country_of_origin: Myanmar'|'chief_complaint: Extreme thirst, frequent urination, numbness or tingling of feet.'|'history_of_present_illness: explain here'|'past_medical_and_surgical_history: explain here'| 'immunization_history: explain here'| 'allergy_history: explain here'| 'currently_taking_drugs: '| 'social_personal_history: explain here'| 'investigation_records: explain here'| 'lab_tests_and_results: explain here'| 'differential_diagnosis: explain here'| 'treatment_and_management_plan: explain here'| 'additional_notes: explain here.'
My first request is
"""
    elif prompt_num == 3:
        system_prompt = ""
    else:
        # Fail before any API call is made.
        raise ValueError(
            f"No prompt found for prompt_num={prompt_num!r}; expected 1, 2 or 3"
        )

    # Generate the requested text using the OpenAI API.
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_request}]

    model = "gpt-3.5-turbo"
    response = client.chat.completions.create(model=model, messages=messages, max_tokens=2000)
    generations = response.choices[0].message.content

    return generations
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from data_generation import generate_records,generate_patients | ||
# Placeholder for any specific imports or code snippets that are relevant to orchestrating the workflow but not directly defined in the overview | ||
|
||
# Main function definition | ||
|
||
def main():
    """Run the pipeline: generate patients, then their medical records."""
    # Create patient names.
    generate_patients(generate_time=1, record_count=5, output_file_name="patient_data.csv")
    # Create the patients' records.
    generate_records(start_index=0, record_count=5, output_file_name="medical_data.csv")


# Run only when executed as a script, so importing this module does not
# start the generation run (and its API calls) as a side effect.
if __name__ == "__main__":
    main()
Oops, something went wrong.