From c0f8c1821db715c4f91415d995633901776c7d72 Mon Sep 17 00:00:00 2001
From: Darren Wu
Date: Thu, 28 Jul 2022 19:15:08 -0500
Subject: [PATCH 1/2] modules added

---
 Complete - Browse file.ipynb |  9 ++++--
 Complete - User path.ipynb   | 52 ++++++++++++++++++++++++++++----
 DARREN_CHLOE_README.md       | 57 ++++++++++++++++++++++++++++++++++++
 case_study.ipynb             |  0
 4 files changed, 110 insertions(+), 8 deletions(-)
 create mode 100644 DARREN_CHLOE_README.md
 create mode 100644 case_study.ipynb

diff --git a/Complete - Browse file.ipynb b/Complete - Browse file.ipynb
index a11ab34..e93e2a0 100644
--- a/Complete - Browse file.ipynb
+++ b/Complete - Browse file.ipynb
@@ -598,7 +598,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.9.12 ('base')",
    "language": "python",
    "name": "python3"
   },
@@ -612,7 +612,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.12"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6cc6f592eb6bc202b06fc871664df7c8ed579442d888ac59e98ddf8873150899"
+   }
   }
  },
  "nbformat": 4,
diff --git a/Complete - User path.ipynb b/Complete - User path.ipynb
index 5cfe67a..0d1c3b9 100644
--- a/Complete - User path.ipynb
+++ b/Complete - User path.ipynb
@@ -18,7 +18,8 @@
     "import json\n",
     "import rowingdata \n",
     "import mne\n",
-    "import re"
+    "import re\n",
+    "from sklearn import preprocessing"
    ]
   },
   {
@@ -70,11 +71,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def apple(readcsv, user_input, process_df, watch_choice):\n",
+    "def apple(readcsv, user_input, process_df, col_names, category_list, watch_choice):\n",
     "    \n",
     "    def main(df, watch_choice):\n",
     "        dictdf, df = dict_df(df)\n",
@@ -83,7 +84,12 @@
     "        df = pre_process_apple(df)\n",
     "        df = process_df(df, watch_choice)\n",
     "        df = add_time(df, start_time)\n",
-    "        return dictdf, df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        standardScale_df = standardScaler(df, col_names)\n",
+    "        return dictdf, df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, standardScale_df\n",
     "\n",
     "    def dict_df(df):\n",
     "        for i in range(0,len(df)):\n",
@@ -115,6 +121,35 @@
     "    def rename_cols(df):\n",
     "        df = df.rename(columns ={'Time_(seconds)' : 'Elapse_time_(sec)'})\n",
     "        return df\n",
+    "\n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        encoder = preprocessing.OneHotEncoder(handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(data_list).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def standardScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
     "    \n",
     "    def output_dict(dictdf,df, no_patient, watch_choice):\n",
     "        names = []\n",
@@ -586,7 +621,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.9.12 ('base')",
    "language": "python",
    "name": "python3"
   },
@@ -600,7 +635,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.12"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "6cc6f592eb6bc202b06fc871664df7c8ed579442d888ac59e98ddf8873150899"
+   }
   }
  },
  "nbformat": 4,
diff --git a/DARREN_CHLOE_README.md b/DARREN_CHLOE_README.md
new file mode 100644
index 0000000..7695e41
--- /dev/null
+++ b/DARREN_CHLOE_README.md
@@ -0,0 +1,57 @@
+# Case Study Documentation
+
+
+# Pre-processing Function Updates ("Complete - User path.ipynb")
+
+Here is a list of the new functions (modules) added to the pre-processing pipeline for DBDP.
+
+1. Standardization
+
+# standardize(df, col_names)
+
+Params:
+Output:
+
+Description:
+
+
+2. Normalization
+
+# normalize(df, col_names)
+
+Params:
+Output:
+
+Description:
+
+3. One Hot Encoding
+
+# oneHotEncode(df, category_list)
+
+Params:
+Output:
+
+Description:
+
+4. MinMax Scaling
+
+# minMaxScaler(df, column_names)
+
+Params:
+Output:
+
+Description:
+
+5. Standard Scaling
+
+# standardScaler(df, column_names)
+
+Params:
+Output:
+
+Description:
+
+6. Data type conversions
+
+Params:
+Output:
\ No newline at end of file
diff --git a/case_study.ipynb b/case_study.ipynb
new file mode 100644
index 0000000..e69de29
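The helpers added in this patch all follow the same scikit-learn pattern: build a fresh transformer for each selected column and call `fit_transform` on a 2D slice of the frame (`df[[column]]`, not `df[column]`, since scikit-learn transformers reject 1D input). A minimal standalone sketch of that pattern; the DataFrame and column name are illustrative, not data from the notebooks:

```python
import pandas as pd
from sklearn import preprocessing

# Illustrative heart-rate column, in the spirit of the watch exports.
df = pd.DataFrame({"Rate_(beats_per_minute)": [69.0, 65.0, 92.0, 108.0, 97.0]})

def standardize(df, column_names):
    # Zero mean, unit variance per selected column.
    for column in column_names:
        scaler = preprocessing.StandardScaler()
        # Double brackets keep the slice 2D, as fit_transform requires.
        df[column] = scaler.fit_transform(df[[column]])
    return df

print(standardize(df.copy(), ["Rate_(beats_per_minute)"]))
```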
From cda6a3d9cff3ae41dd9e08d2ee049500b0255da1 Mon Sep 17 00:00:00 2001
From: Darren Wu
Date: Fri, 29 Jul 2022 00:02:54 -0500
Subject: [PATCH 2/2] documentation finished for module

---
 AppleWatch.csv             | 120 ++++++++++++++++++
 Complete - User path.ipynb | 241 ++++++++++++++++++++++++++++++++++---
 DARREN_CHLOE_README.md     |  38 +++---
 apple.csv                  | 108 +++++++++++++++++
 case_study.ipynb           |   0
 5 files changed, 474 insertions(+), 33 deletions(-)
 create mode 100644 AppleWatch.csv
 create mode 100644 apple.csv
 delete mode 100644 case_study.ipynb

diff --git a/AppleWatch.csv b/AppleWatch.csv
new file mode 100644
index 0000000..9b54828
--- /dev/null
+++ b/AppleWatch.csv
@@ -0,0 +1,120 @@
+Workout date, 2019-07-17 11:49:53
+Duration, 575
+Calories burned, 0
+Mean heart rate, 88.12
+Maximum heart rate, 108
+Notes, Other exercise
+
+Time (seconds), Rate (beats per minute)
+    12.6, 69
+    17.6, 65
+    21.6, 66
+    25.5, 67
+    29.5, 67
+    33.5, 73
+    42.5, 92
+    46.5, 94
+    51.5, 97
+    53.5, 98
+    62.5, 102
+    66.5, 102
+    68.5, 102
+    75.5, 102
+    82.5, 105
+    83.5, 105
+    89.5, 103
+    93.5, 104
+   101.5, 96
+   106.5, 97
+   112.5, 101
+   117.5, 100
+   118.5, 101
+   125.5, 103
+   128.5, 103
+   136.5, 108
+   138.5, 107
+   144.5, 103
+   149.5, 103
+   156.5, 101
+   160.5, 101
+   163.5, 102
+   172.5, 106
+   177.5, 106
+   182.5, 106
+   185.5, 107
+   191.5, 106
+   197.5, 106
+   199.5, 104
+   206.5, 99
+   211.5, 101
+   217.5, 102
+   220.5, 103
+   223.5, 106
+   229.5, 106
+   235.5, 103
+   242.5, 104
+   244.5, 105
+   248.5, 106
+   257.5, 107
+   258.5, 108
+   263.5, 106
+   272.8, 104
+   277.8, 104
+   278.8, 105
+   285.8, 101
+   292.8, 105
+   297.8, 104
+   298.8, 104
+   307.8, 104
+   309.8, 104
+   313.8, 105
+   319.8, 101
+   327.8, 105
+   331.8, 104
+   337.8, 98
+   340.8, 97
+   346.8, 95
+   348.8, 95
+   356.8, 96
+   358.8, 96
+   363.8, 97
+   370.8, 81
+   377.8, 72
+   378.8, 73
+   385.8, 72
+   392.8, 72
+   396.8, 73
+   399.8, 70
+   407.8, 70
+   411.8, 72
+   413.8, 72
+   421.8, 66
+   427.8, 69
+   430.8, 71
+   436.8, 64
+   439.8, 65
+   446.8, 75
+   449.8, 73
+   457.8, 69
+   459.8, 72
+   467.8, 67
+   471.8, 66
+   473.8, 65
+   482.8, 61
+   485.8, 60
+   491.8, 68
+   494.8, 72
+   502.8, 67
+   507.8, 65
+   512.8, 64
+   513.8, 64
+   518.8, 64
+   526.8, 65
+   528.8, 64
+   534.8, 64
+   542.8, 66
+   544.8, 66
+   548.8, 65
+   557.8, 74
+   558.8, 73
+   566.0, 67
diff --git a/Complete - User path.ipynb b/Complete - User path.ipynb
index 0d1c3b9..0b8807a 100644
--- a/Complete - User path.ipynb
+++ b/Complete - User path.ipynb
@@ -88,7 +88,7 @@
     "        standardize_df = standardize(df, col_names)\n",
     "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
     "        minMaxScale_df = minMaxScaler(df, col_names)\n",
-    "        standardScale_df = standardScaler(df, col_names)\n",
-    "        return dictdf, df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, standardScale_df\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return dictdf, df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "\n",
     "    def dict_df(df):\n",
@@ -137,7 +137,7 @@
     "    def oneHotEncode(df, category_list):\n",
-    "        encoder = preprocessing.OneHotEncoder(handle_unknown = 'ignore')\n",
-    "        encoder.fit(df)\n",
-    "        return encoder.transform(data_list).toarray()\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
     "    \n",
     "    def minMaxScaler(df, column_names):\n",
@@ -145,10 +145,10 @@
-    "    def standardScaler(df, column_names):\n",
+    "    def maxAbsScaler(df, column_names):\n",
     "        for column in column_names:\n",
-    "            scaler = preprocessing.StandardScaler()\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
     "            df[column] = scaler.fit_transform(df[[column]])\n",
     "        return df\n",
     "    \n",
     "    def output_dict(dictdf,df, no_patient, watch_choice):\n",
@@ -174,14 +174,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def fitbit(readcsv, user_input, process_df, output, watch_choice):\n",
+    "def fitbit(readcsv, user_input, process_df, output, col_names, category_list, watch_choice):\n",
     "    \n",
     "    def main(df, watch_choice):\n",
     "        df = process_df(df, watch_choice)\n",
     "        start_time = df.iloc[0][0]\n",
     "        df = add_time(df, start_time)\n",
     "        df = rename_cols(df)\n",
-    "        return df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "    \n",
     "    def add_time(df, start_time):\n",
     "        df['Elapsed_time_(sec)'] = 'NaN'\n",
@@ -194,6 +199,35 @@
     "        df = df.rename(columns ={'Time' : 'Actual_time', 'Heart_Rate' : 'Rate_(beats_per_minute)'})\n",
     "        return df\n",
     "    \n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
     "    no_patient = 1\n",
     "    print(user_input)\n",
     "    fitbit_df = readcsv(user_input)\n",
@@ -207,7 +241,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def garmin(user_input, process_df, output, watch_choice):\n",
+    "def garmin(user_input, process_df, output, col_names, category_list, watch_choice):\n",
     "    \n",
     "    garmin_df = rowingdata.TCXParser(user_input)\n",
     "    garmin_df.write_csv(\"garmin_df.csv\")\n",
@@ -218,7 +252,12 @@
     "        df = add_time(df)\n",
     "        df = rename_cols(df)\n",
     "        df = df.drop('index', axis = 1)\n",
-    "        return df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "    \n",
     "    def add_time(df):\n",
     "        df['Actual_time'] = df['TimeStamp_(sec)'].apply(lambda a : datetime.datetime.utcfromtimestamp(a).strftime('%Y-%m-%d %H:%M:%S'))\n",
@@ -235,6 +274,35 @@
     "            'Stroke500mpace_(sec/500m)' : 'Stroke_500m_pace_(sec/500m)',\n",
     "            })\n",
     "        return df\n",
+    "\n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
     "    \n",
     "    no_patient = 1\n",
     "    df = main(garmin_df, watch_choice)\n",
@@ -243,18 +311,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def miband(user_input, process_df, output, watch_choice):\n",
+    "def miband(user_input, process_df, output, col_names, category_list, watch_choice):\n",
     "    mi_df = pd.read_excel(user_input)\n",
     "    \n",
     "    def main(df, watch_choice):\n",
     "        df = process_df(df, watch_choice)\n",
     "        df = add_time(df)\n",
     "        df = rename_cols(df)\n",
-    "        return df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "    \n",
     "    def add_time(df):\n",
     "        df['Actual_time'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])\n",
@@ -268,6 +341,35 @@
     "        df = df.rename(columns ={'Heart_rate' : 'Rate_(beats_per_minute)'})\n",
     "        return df\n",
     "    \n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
     "    no_patient = 1\n",
     "    df = main(mi_df, watch_choice)\n",
     "    output(df, no_patient, watch_choice)"
@@ -279,7 +381,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def ecg(user_input, process_df, output, watch_choice):\n",
+    "def ecg(user_input, process_df, output, col_names, category_list, watch_choice):\n",
     "    data = mne.io.read_raw_edf(user_input)\n",
     "    raw_data = data.get_data()\n",
     "    info = data.info\n",
@@ -298,7 +400,12 @@
     "        df = process_df(df, watch_choice)\n",
     "        df = add_time(df, info)\n",
     "        df = rename_cols(df)\n",
-    "        return df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "    \n",
     "    def pre_process_ECG(df):\n",
     "        df = df.drop('Marker', axis = 1)\n",
@@ -323,6 +430,36 @@
     "        df = df.rename(columns ={'#_ecg' : 'ECG_(mV)'})\n",
     "        return df\n",
     "    \n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    \n",
     "    no_patient = 1\n",
     "    df, info = get_data(data)\n",
     "    df = main(info, df, watch_choice)\n",
@@ -335,7 +472,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def biovotion(user_input, device_id, get_filenames, output, watch_choice):\n",
+    "def biovotion(user_input, device_id, get_filenames, output, col_names, category_list, watch_choice):\n",
     "    \n",
     "    def main(user_input, device_id, watch_choice):\n",
     "        filenames = get_filenames(user_input)\n",
@@ -343,7 +480,12 @@
     "        dataframes, commoncols = read_data(filenames, colnames)\n",
     "        df = create_df_final(dataframes, commoncols)\n",
     "        df = add_time(df)\n",
-    "        return df\n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
     "    \n",
     "# def get_filenames(user_input):\n",
     "#     files = os.listdir(user_input) \n",
@@ -394,6 +536,35 @@
     "            df['Elapsed_time_(sec)'].iloc[i] = (datetime.datetime.strptime(df['Actual_time'][i],'%Y-%m-%d %H:%M:%S')- datetime.datetime.strptime(start_time,'%Y-%m-%d %H:%M:%S')).total_seconds()\n",
     "        return df\n",
     "    \n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
     "    no_patient = 1\n",
     "    df = main(user_input, device_id, watch_choice)\n",
     "    output(df, no_patient, watch_choice)"
@@ -500,12 +671,46 @@
     "        df = pd.concat(dataframes, axis = 1)\n",
     "        return df\n",
     "\n",
-    "    def main(user_input, preprocess_empatica, add_time_empatica, all_dfs, watch_choice):\n",
+    "    def main(user_input, preprocess_empatica, add_time_empatica, all_dfs, col_names, category_list, watch_choice):\n",
     "        filenames = get_filenames(user_input)\n",
     "        dataframes = read_data(user_input, filenames, preprocess_empatica, add_time_empatica)\n",
     "        df = all_dfs(dataframes)\n",
     "        df['Watch_type'] = watch_choice\n",
-    "        return df \n",
+    "        normalize_df = normalize(df, col_names)\n",
+    "        standardize_df = standardize(df, col_names)\n",
+    "        binary_one_hot_encode = oneHotEncode(df, category_list)\n",
+    "        minMaxScale_df = minMaxScaler(df, col_names)\n",
+    "        maxAbsScale_df = maxAbsScaler(df, col_names)\n",
+    "        return df, normalize_df, standardize_df, binary_one_hot_encode, minMaxScale_df, maxAbsScale_df\n",
+    "\n",
+    "    def normalize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            # preprocessing.normalize expects a 2D array; axis=0 scales each column to unit norm\n",
+    "            df[column] = preprocessing.normalize(df[[column]], axis=0)\n",
+    "        return df\n",
+    "    \n",
+    "    def standardize(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.StandardScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def oneHotEncode(df, category_list):\n",
+    "        # category_list mirrors OneHotEncoder's categories parameter; an empty list means derive them from df\n",
+    "        encoder = preprocessing.OneHotEncoder(categories = category_list if category_list else 'auto', handle_unknown = 'ignore')\n",
+    "        encoder.fit(df)\n",
+    "        return encoder.transform(df).toarray()\n",
+    "    \n",
+    "    def minMaxScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MinMaxScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
+    "    \n",
+    "    def maxAbsScaler(df, column_names):\n",
+    "        for column in column_names:\n",
+    "            scaler = preprocessing.MaxAbsScaler()\n",
+    "            df[column] = scaler.fit_transform(df[[column]])\n",
+    "        return df\n",
     "    \n",
     "    no_patient = 1\n",
-    "    df = main(user_input, preprocess_empatica, add_time_empatica, all_dfs, watch_choice)\n",
+    "    df = main(user_input, preprocess_empatica, add_time_empatica, all_dfs, col_names, category_list, watch_choice)\n",
diff --git a/DARREN_CHLOE_README.md b/DARREN_CHLOE_README.md
index 7695e41..291a965 100644
--- a/DARREN_CHLOE_README.md
+++ b/DARREN_CHLOE_README.md
@@ -9,47 +9,55 @@
 
 # standardize(df, col_names)
 
-Params:
-Output:
+Params: df (dataframe containing the data), col_names (selected columns to be transformed)
+Output: dataframe with scaled, transformed columns
+
+Description: Standardize features by removing the mean and scaling to unit variance.
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
 
-Description:
 
 2. Normalization
 
 # normalize(df, col_names)
 
-Params:
-Output:
+Params: df (dataframe containing the data), col_names (selected columns to be transformed)
+Output: dataframe with normalized, transformed columns
 
-Description:
+Description: Scale input vectors individually to unit norm (vector length).
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
 
 3. One Hot Encoding
 
 # oneHotEncode(df, category_list)
 
-Params:
-Output:
+Params: df (dataframe containing the data), category_list (categories expected for the dataframe)
+Output: one-hot numeric array
 
-Description:
+Note: if category_list is empty, the function automatically derives the categories from the unique values of each feature.
+
+Description: Encode categorical features as a one-hot numeric array. The input to this transformer should be an array-like of integers or strings denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the sparse parameter).
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
 
 4. MinMax Scaling
 
 # minMaxScaler(df, column_names)
 
-Params:
-Output:
+Params: df (dataframe containing the data), col_names (selected columns to be transformed)
+Output: dataframe with scaled, transformed columns
 
-Description:
+Description: Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
 
-5. Standard Scaling
+5. Max Abs Scaling
 
-# standardScaler(df, column_names)
+# maxAbsScaler(df, column_names)
 
-Params:
-Output:
+Params: df (dataframe containing the data), col_names (selected columns to be transformed)
+Output: dataframe with scaled, transformed columns
 
-Description:
+Description: Scale each feature by its maximum absolute value. This scaler scales each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
 
 6. Data type conversions
 
 Params:
 Output:
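The README entries above map one-to-one onto scikit-learn transformers. For the one-hot case in particular, `category_list` behaves like `OneHotEncoder`'s `categories` parameter: an explicit list pins the category order, while an empty list falls back to deriving categories from the data. A small sketch under that assumption; the category values are illustrative:

```python
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"Watch_type": ["apple", "fitbit", "apple"]})

# Explicit categories fix the column order; values unseen at fit time are
# encoded as all zeros because of handle_unknown='ignore'.
category_list = [["apple", "fitbit", "garmin"]]
encoder = preprocessing.OneHotEncoder(categories=category_list, handle_unknown="ignore")
print(encoder.fit_transform(df).toarray())
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [1. 0. 0.]]
```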
diff --git a/apple.csv b/apple.csv
new file mode 100644
index 0000000..2d07fd5
--- /dev/null
+++ b/apple.csv
@@ -0,0 +1,108 @@
+, Rate (beats per minute)
+5,73
+6,92
+7,94
+8,97
+9,98
+10,102
+11,102
+12,102
+13,102
+14,105
+15,105
+16,103
+17,104
+18,96
+19,97
+20,101
+21,100
+22,101
+23,103
+24,103
+25,108
+26,107
+27,103
+28,103
+29,101
+30,101
+31,102
+32,106
+33,106
+34,106
+35,107
+36,106
+37,106
+38,104
+39,99
+40,101
+41,102
+42,103
+43,106
+44,106
+45,103
+46,104
+47,105
+48,106
+49,107
+50,108
+51,106
+52,104
+53,104
+54,105
+55,101
+56,105
+57,104
+58,104
+59,104
+60,104
+61,105
+62,101
+63,105
+64,104
+65,98
+66,97
+67,95
+68,95
+69,96
+70,96
+71,97
+72,81
+73,72
+74,73
+75,72
+76,72
+77,73
+78,70
+79,70
+80,72
+81,72
+82,66
+83,69
+84,71
+85,64
+86,65
+87,75
+88,73
+89,69
+90,72
+91,67
+92,66
+93,65
+94,61
+95,60
+96,68
+97,72
+98,67
+99,65
+100,64
+101,64
+102,64
+103,65
+104,64
+105,64
+106,66
+107,66
+108,65
+109,74
+110,73
+111,67
diff --git a/case_study.ipynb b/case_study.ipynb
deleted file mode 100644
index e69de29..0000000
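As a closing illustration, the apple.csv file added above can be fed straight through one of the documented helpers. This is a sketch, not part of the patches: the file path is assumed to be the repository copy, and the helper is a standalone duplicate of the patched minMaxScaler.

```python
import pandas as pd
from sklearn import preprocessing

# apple.csv's first column is an unnamed index; the second is heart rate.
df = pd.read_csv("apple.csv", index_col=0)
df.columns = ["Rate_(beats_per_minute)"]

def minMaxScaler(df, column_names):
    # Rescale each selected column into [0, 1] over this dataset.
    for column in column_names:
        scaler = preprocessing.MinMaxScaler()
        df[column] = scaler.fit_transform(df[[column]])
    return df

print(minMaxScaler(df.copy(), ["Rate_(beats_per_minute)"]).head())
```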