
Commit

initial commit
nchanko committed Mar 17, 2024
0 parents commit df63880
Showing 8 changed files with 307 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.env
__pycache__
100 changes: 100 additions & 0 deletions data_generation.py
@@ -0,0 +1,100 @@
from required_libaries import time, tqdm, csv
from gpt_processing import process_gpt
from file_management import save_patient, save_records

# Placeholder for any specific imports or functions that are relevant to data generation but not directly defined in the overview

# Function definitions related to generating patient data


def generate_patients(user_request: str = None, generate_time: int = 1, record_count: int = 10, output_file_name: str = "patient_data.csv"):
    """generate_time is the number of times the generation should run. Defaults to 1."""

if user_request is None:
user_request =""" Heart disease
Cancer
Chronic lower respiratory disease
Stroke
Alzheimer's disease
Diabetes
Influenza and pneumonia
Kidney disease
Suicide
Septicemia
Chronic liver disease and cirrhosis
Hypertension
Parkinson's disease
Intentional self-harm
Chronic obstructive pulmonary disease (COPD)
Liver cancer
Liver disease and cirrhosis
Falls
Leukemia
Breast cancer, Country of origin: USA"""

    for i in tqdm(range(generate_time)):
        # gen_recs: patient names and profiles generated by GPT.
        gen_recs = process_gpt(str(user_request), record_count, 1)
        print("Generation pass:", i + 1)
        print(gen_recs)

        # Save the generated patients
        save_patient(gen_recs, output_file_name)
        time.sleep(2)


# Read patient data
def read_patient_names():
with open('patient_data.csv', mode='r') as csv_file:
csv_reader = csv.reader(csv_file)
patients = []
        next(csv_reader)  # Skip header row (note: save_patient appends rows without a header, so drop this line if the file has none)
for row in csv_reader:
patient = " ".join(row[:7])
patients.append(patient)
return patients

def generate_records(start_index: int = 0, max_attempts: int = 10, record_count: int = 10, output_file_name: str = "medical_data3.csv"):
    """Generate patient medical records from the patient info and country of origin
    stored in patient_data.csv."""

attempts = 0

last_processed_index = start_index - 1

while attempts < max_attempts:
try:
#Read patient csv file.
patients = read_patient_names()[start_index:]
total_patients = len(patients)


for i, patient in tqdm(enumerate(patients, start=start_index), total=total_patients):
print(patient)
try:
                    gpt_generated = process_gpt(str(patient), record_count, 2)
                    # Save the patient note
                    save_records(gpt_generated, output_file_name)
                    last_processed_index = i
                    print(f"Patient {i+1} processed")
time.sleep(2)
except IndexError:
print(f"IndexError: Please check the CSV file and create_record() function for patient {i+1}.")
time.sleep(2)
raise # Re-raise the error to retry the current row
                except Exception as e:
                    time.sleep(2)
                    print(f"Error: {e} occurred for patient {i+1}. Moving on...")
                    continue  # Move on to the next patient after any other error

break # Exit loop if successful
except IndexError:
print("IndexError: Please check the CSV file and create_record() function.")
time.sleep(2)
attempts += 1
            if attempts == max_attempts:
                print(f"Failed after {attempts} attempts. Exiting. Restart from index {last_processed_index + 1}.")
else:
start_index = last_processed_index + 1
time.sleep(2)
print(f"Retrying from patient {start_index}...")
111 changes: 111 additions & 0 deletions file_management.py
@@ -0,0 +1,111 @@

from required_libaries import pd, time, os

# Function definitions related to saving and reading data
def check_csv(filename: str):
"""Check if the file exists"""
try:
with open(filename, 'r') as file:
return True
except FileNotFoundError:
return False

def save_records(gen_recs: str, outputfilename: str):
    """Parse a pipe-delimited GPT medical note and append it to the output CSV."""
#outputfilename ="medical_record2.csv"

rec_values = gen_recs.split('|')
medical_record = {
'record_id': rec_values[0].split(":")[1].strip(),
'date_of_visit': rec_values[1].split(":")[1].strip(),
'patient_id': rec_values[2].split(":")[1].strip(),
'patient_name': rec_values[3].split(":")[1].strip(),
'age': (rec_values[4]).split(":")[1].strip(),
'gender': rec_values[5].split(":")[1].strip(),
'race': rec_values[6].split(":")[1].strip(),
'country_of_origin': rec_values[7].split(":")[1].strip(),
'chief_complaint': rec_values[8].split(":")[1].strip(),
'history of present illness': rec_values[9].split(":")[1].strip(),
'past medical and surgical history': rec_values[10].split(":")[1].strip(),
'immunization history': rec_values[11].split(":")[1].strip(),
'allergy history': rec_values[12].split(":")[1].strip(),
'currently taking drugs': rec_values[13].split(":")[1].strip(),
'social/personal history': rec_values[14].split(":")[1].strip(),
'investigation records': rec_values[15].split(":")[1].strip(),
'lab tests and results': rec_values[16].split(":")[1].strip(),
'differential diagnosis': rec_values[17].split(":")[1].strip(),
'treatment and management plan': rec_values[18].split(":")[1].strip(),
'additional notes': rec_values[19].split(":")[1].strip()
}

# create a DataFrame from the medical record dictionary
    df = pd.DataFrame([medical_record])

# reorder the columns to match the desired order
df = df[['record_id', 'date_of_visit', 'patient_id', 'patient_name', 'age', 'gender', 'race', 'country_of_origin', 'chief_complaint',
'history of present illness', 'past medical and surgical history', 'immunization history', 'allergy history', 'currently taking drugs',
'social/personal history', 'investigation records', 'lab tests and results', 'differential diagnosis', 'treatment and management plan',
'additional notes']]

# export the DataFrame to a CSV file
df.to_csv(outputfilename, mode='a', header=False,index=False)
print("Record saved successfully")

def save_patient(patient_names: str, outfile_name: str):
    """Parse GPT-generated patient lines and append them to the output CSV (outfile_name)."""
# Load the existing patient data
#outfile_name = "patient_data.csv"
if os.path.exists(outfile_name):
# Load the existing patient data
df = pd.read_csv(outfile_name,header = None)
if df.empty:
max_id = 0
else:
# Get the maximum ID value in the existing data
max_id = df[0].str.replace('P', '').astype(int).max()
else:
# File doesn't exist, start max index from zero
max_id = 0
start_id = max_id+1

records = patient_names.split('\n')

# Create an empty list to store the patient data
patient_data = []

    # Loop through each record and extract the patient information
for record in records:
# Skip empty records
if not record:
continue
# Split the record into individual data fields
try:
record_data = record.split('|')
# Extract the patient information from the record
            patient_id = 'P' + str(start_id).zfill(6)  # avoid shadowing the built-in id()
            name = record_data[0].split('. ')[1].strip()
            age = record_data[1].strip()
            gender = record_data[2].strip()
            race = record_data[3].strip()
            country = record_data[4].strip()
            chief_complaint = record_data[5].strip()
            disease = record_data[6].strip()
            # Add the patient data to the list
            patient_data.append([patient_id, name, age, gender, race, country, chief_complaint, disease])
            # Increment the running patient ID
            start_id += 1
        except Exception as e:
            print("Error:", e)
            print("Skipping malformed record after 1 second...")
            time.sleep(1)
            # Keep start_id just after the most recently saved patient ID, then move on to the next record
            start_id = int(patient_data[-1][0].strip('P')) + 1 if patient_data else start_id
            continue

# Convert the list of patient data to a DataFrame
df = pd.DataFrame(patient_data, columns=['Patient ID', 'Name', 'Age', 'Gender', 'Race', 'Country of Origin', 'Chief Complaint', 'Disease'])
# Append the data to the existing file
print("pass")
df.to_csv(outfile_name, mode='a', header=False, index=False)
print(f"Successfully appended to file {outfile_name}")
51 changes: 51 additions & 0 deletions gpt_processing.py
@@ -0,0 +1,51 @@

from required_libaries import client

# Placeholder for any specific imports or functions that are relevant to GPT processing but not directly defined in the overview

# Function definitions related to processing or generating text with GPT models

def process_gpt(user_request: str, record_count: int = 10, prompt_num: int = None):
    """user_request is the prompt from the user.
    prompt_num selects the system prompt:
    1 to generate patient names and profiles.
    2 to generate a patient medical record."""


if prompt_num == 1:
system_prompt = str(f"""Generate {record_count} dummy patient records.
I will provide the list of differential diseases and country of origin.
You will answer me with full names, ages, genders, race, country of origin, chief complaint (reason to visit the doctor) and disease.
Your chief complaint must be complex with overlapping symptoms and should be relevant to the country of origin. Separate each line by \n and separate each cell by |.
Do not write extra sentences. Do not index. Example format: 1. Sarah Fish |23|Female| White| USA| Fever, headache, joint pains, and vomiting|disease. My first request is
.""")
    elif prompt_num == 2:
        system_prompt = str(f"""Generate {record_count} medical notes for the given patient. I will provide you the patient_id, name, age, gender, race, country of origin and chief complaint. You will answer me with these columns:
record_id (format : ddmmpatient_id)| date_of_visit(dd/mm/yyyy) |patient_id |patient_name |age|gender|race|country_of_origin| chief_complaint| history_of_present_illness| past_medical_and_surgical_history|immunization_history |allergy_history| currently_taking_drugs |social_personal_history |investigation_records| lab_tests_and_results| differential_diagnosis| treatment_and_management_plan| additional_notes. Each column must be separated by "|".
Each entry must read like a natural medical note; use + or - for present and absent findings. currently_taking_drugs must include name, dosage, form, frequency and duration.
Do not write extra sentences.
If more information needs to be added, put it in additional_notes. If there is nothing to generate, write Null.
The medical note must be as complete as possible and must be relevant to chief complaint. History must contain all relevant additional information related to patient chief complaint.
Example format is Example format: 'record_id: date_of_visit+patient_id'|'date_of_visit: dd/mm/yyyy'| 'patient_id: P000003'| 'patient_name: Aung Soe'| 'age: 55'|'gender: Male'| 'race: Asian'| 'country_of_origin: Myanmar'|'chief_complaint: Extreme thirst, frequent urination, numbness or tingling of feet.'|'history_of_present_illness: explain here'|'past_medical_and_surgical_history: explain here'| 'immunization_history: explain here'| 'allergy_history: explain here'| 'currently_taking_drugs: '| 'social_personal_history: explain here'| 'investigation_records: explain here'| 'lab_tests_and_results: explain here', 'differential_diagnosis: explain here' , 'treatment_and_management_plan: explain here', 'additional_notes: explain here.'
My first request is
""")
    elif prompt_num == 3:
        system_prompt = ""

    else:
        raise ValueError("No prompt found: prompt_num must be 1, 2 or 3")

# Generate patient information using OpenAI API
messages = [{"role": "system", "content":system_prompt },
{"role": "user", "content": user_request}]


model = "gpt-3.5-turbo"
response = client.chat.completions.create(model=model, messages=messages, max_tokens=2000)
generations = response.choices[0].message.content

return generations
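A minimal usage sketch of process_gpt (assuming required_libaries exposes a configured OpenAI client and the API key is loaded from .env; the request text below is only an example):

# Hypothetical call: generate 3 dummy patient profiles for two diseases.
names = process_gpt("Dengue fever\nTyphoid, Country of origin: Myanmar", record_count=3, prompt_num=1)
print(names)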



12 changes: 12 additions & 0 deletions main.py
@@ -0,0 +1,12 @@
from data_generation import generate_records, generate_patients
# Placeholder for any specific imports or code snippets that are relevant to orchestrating the workflow but not directly defined in the overview

# Main function definition

def main():
    # Create patient names and profiles
    generate_patients(generate_time=1, record_count=5, output_file_name="patient_data.csv")
    # Create the patients' medical records
    generate_records(start_index=0, record_count=5, output_file_name="medical_data.csv")


if __name__ == "__main__":
    main()
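The shared imports come from required_libaries.py, which is among the 8 changed files but not rendered above. A hypothetical sketch of what it would need to expose for these modules to run (the client setup is an assumption, not the committed file):

# required_libaries.py -- hypothetical sketch, not the committed file.
# The modules above import pd, time, os, tqdm, csv and client from here.
import csv
import os
import time

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

load_dotenv()       # reads OPENAI_API_KEY from the .env file listed in .gitignore
client = OpenAI()   # assumes OPENAI_API_KEY is set in the environment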
