"""Streamlit app exploring the heart-failure clinical records dataset.

The module defines plotting / modelling helpers and then, at import time,
renders the page: dataset overview, correlation heatmaps, distribution
plots, gender-faceted count plots and cross-validated classifier accuracy.
"""
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

DATA_PATH = 'heart_failure_clinical_records_dataset.csv'
TARGET = 'DEATH_EVENT'


# ---------------------------------------------------------------------------
# Data loading / dtype helpers
# ---------------------------------------------------------------------------

def return_categoricals(df, threshold=5):
    """Return the names of columns having at most ``threshold`` distinct values.

    NaN counts as one distinct value. Columns are returned in frame order.
    """
    # A plain comprehension (instead of ``filter`` on the label itself) keeps
    # columns whose label happens to be falsy, e.g. ``0`` or ``''``.
    return [col for col in df.columns
            if df[col].nunique(dropna=False) <= threshold]


def to_categorical(columns, df):
    """Cast ``columns`` of ``df`` to the ``category`` dtype, in place.

    The same (mutated) dataframe is returned for convenience.
    """
    for col in columns:
        df[col] = df[col].astype('category')
    return df


@st.cache
def load_data():
    """Read the dataset and return ``(raw, cleaned)`` dataframes.

    ``raw`` is the CSV exactly as read. ``cleaned`` is a de-duplicated copy
    whose low-cardinality columns (at most 5 distinct values) are cast to
    ``category``.
    """
    df = pd.read_csv(DATA_PATH)
    # Explicit copy: the casts below must never write through to ``df``.
    new_df = df.drop_duplicates().copy()
    to_categorical(return_categoricals(new_df, threshold=5), new_df)
    return df, new_df


# ---------------------------------------------------------------------------
# Correlation
# ---------------------------------------------------------------------------

def _show_heatmap(corr, figsize):
    """Render ``corr`` as an annotated heatmap in the Streamlit page."""
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(data=corr, annot=True, cmap='Spectral',
                cbar_kws={'aspect': 50}, square=True, ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
    fig.tight_layout()
    st.write(fig)
    # The script is re-executed on every interaction; close the figure so
    # matplotlib does not accumulate one open figure per rerun.
    plt.close(fig)


def correlation_quant(df):
    """Show the Pearson correlation heatmap of the numeric columns.

    ``DEATH_EVENT`` is cast to ``int64`` first so that it is included.
    """
    numeric = df.astype({TARGET: 'int64'}).select_dtypes(include='number')
    _show_heatmap(numeric.corr(), figsize=(10, 8))


def cramers_corrected_stat(contingency_table):
    """Return the bias-corrected Cramer's V of a contingency table.

    Returns ``nan`` when the chi-square test cannot be computed
    (``chi2_contingency`` raises ``ValueError``) or when the statistic is
    undefined because the table holds at most one observation or one of the
    variables has a single level.
    """
    try:
        chi2 = chi2_contingency(contingency_table)[0]
    except ValueError:
        return np.nan

    n = contingency_table.sum().sum()
    if n <= 1:
        return np.nan  # (n - 1) is a divisor below

    r, k = contingency_table.shape
    phi2 = chi2 / n
    phi2_corrected = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corrected = r - ((r - 1) ** 2) / (n - 1)
    k_corrected = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(k_corrected - 1, r_corrected - 1)
    if denominator <= 0:
        return np.nan  # single-level variable: association is undefined

    return (phi2_corrected / denominator) ** 0.5


def categorical_corr_matrix(df):
    """Return the symmetric Cramer's V matrix of the ``category`` columns."""
    df = df.select_dtypes(include='category')
    cols = df.columns
    corr_matrix = pd.DataFrame(np.eye(len(cols)), index=cols, columns=cols)

    # Compute each unordered pair once and mirror it.
    for i, col1 in enumerate(cols):
        for col2 in cols[:i]:
            crosstab = pd.crosstab(df[col1], df[col2], dropna=False)
            value = cramers_corrected_stat(crosstab)
            corr_matrix.loc[col1, col2] = value
            corr_matrix.loc[col2, col1] = value

    return corr_matrix


def correlation_categorical(df):
    """Show the Cramer's V heatmap of the categorical columns."""
    _show_heatmap(categorical_corr_matrix(df), figsize=(11, 5))


# ---------------------------------------------------------------------------
# Distributions
# ---------------------------------------------------------------------------

def _make_grid(n_plots, figsize, ncols=2):
    """Create a figure with ``n_plots`` axes laid out in ``ncols`` columns.

    Surplus axes in the last row are removed. Returns ``(fig, axes)`` where
    ``axes`` is a list of exactly ``n_plots`` axes.
    """
    nrows = max(1, -(-n_plots // ncols))  # ceiling division
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize,
                            squeeze=False)
    axes = list(axs.flat)
    for unused in axes[n_plots:]:
        unused.remove()
    return fig, axes[:n_plots]


def _show_grid(fig):
    """Apply the shared subplot spacing, render ``fig`` and close it."""
    fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                        wspace=0.6, hspace=0.8)
    st.write(fig)
    plt.close(fig)


def visualization_categorical(df):
    """Show one count plot per ``category`` column of ``df``."""
    titles = list(df.select_dtypes(include='category'))
    fig, axes = _make_grid(len(titles), figsize=(13, 11))

    for ax, title in zip(axes, titles):
        sns.countplot(x=title, data=df, palette='Pastel1', ax=ax)

    _show_grid(fig)


def visualization_continuous(df):
    """Show, per non-categorical column, its distribution split by outcome."""
    titles = list(df.select_dtypes(exclude='category'))
    outcome_no = df[df[TARGET] == 0]
    outcome_yes = df[df[TARGET] == 1]
    fig, axes = _make_grid(len(titles), figsize=(12, 11))

    for ax, title in zip(axes, titles):
        sns.distplot(outcome_no[title], bins=10, ax=ax, label='No')
        sns.distplot(outcome_yes[title], bins=10, ax=ax, label='Yes')
        ax.legend(title=TARGET)

    _show_grid(fig)


# ---------------------------------------------------------------------------
# Faceted count plots
# ---------------------------------------------------------------------------

def modified_countplot(**kargs):
    """Draw a count plot for the next non-empty facet on the current axes.

    Intended to be passed to ``FacetGrid.map``. Expected keyword arguments:

    * ``facet_generator`` -- generator yielding tuples whose second item is
      the facet's dataframe (as produced by ``FacetGrid.facet_data()``).
    * ``x`` / ``hue`` -- column names; the columns must have a pandas
      categorical dtype, their categories fix the bar and hue order.
    * ``palette`` -- optional seaborn palette, ``'Pastel1'`` by default.

    Any other keyword arguments are ignored.
    """
    # Advance the shared generator to the next facet that has rows; empty
    # facets are skipped.
    facet_data = None
    for facet in kargs['facet_generator']:
        if facet[1].shape[0]:
            facet_data = facet[1]
            break
    if facet_data is None:
        return  # generator exhausted: nothing left to draw

    x, hue = kargs.get('x'), kargs.get('hue')
    cols = [col for col in (x, hue) if col]
    order = facet_data[x].dtype.categories if x else None
    hue_order = facet_data[hue].dtype.categories if hue else None

    sns.countplot(x=cols[0], hue=hue,
                  order=order, hue_order=hue_order,
                  data=facet_data.loc[:, cols],
                  palette=kargs.get('palette', 'Pastel1'))


def _facet_countplot(df, row, x, plot_func):
    """Render a ``row`` x ``sex`` grid of ``x`` counts coloured by outcome."""
    facet = sns.FacetGrid(df, row=row, col='sex', sharex=False,
                          sharey=False, margin_titles=True)
    facet.map(plot_func, x=x, hue=TARGET,
              palette='Pastel2', facet_generator=facet.facet_data())
    facet.set_xlabels(x)
    facet.set_ylabels('Count')
    facet.add_legend(title=TARGET)
    # Pass the figure explicitly instead of relying on pyplot's global figure.
    st.pyplot(facet.fig)
    plt.close(facet.fig)


def smoking_blood(df, modified_countplot):
    """Show high-blood-pressure counts faceted by smoking (rows) and sex."""
    _facet_countplot(df, row='smoking', x='high_blood_pressure',
                     plot_func=modified_countplot)


def diabetes_anemia(df, modified_countplot):
    """Show anaemia counts faceted by diabetes (rows) and sex (columns)."""
    _facet_countplot(df, row='diabetes', x='anaemia',
                     plot_func=modified_countplot)


# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------

def _report_cv_accuracy(model, display_name, x, y):
    """Write the mean 5-fold cross-validation score of ``model``."""
    cv_scores = cross_val_score(model, x, y, cv=5)
    st.write(f'mean validation accuracy for {display_name}: ',
             np.mean(cv_scores))


def knn(x, y):
    """Report the CV accuracy of a distance-weighted k-NN classifier."""
    _report_cv_accuracy(KNeighborsClassifier(weights='distance'),
                        'K Nearest Neighbors', x, y)


def ridge(x, y):
    """Report the CV accuracy of a ridge classifier."""
    _report_cv_accuracy(RidgeClassifier(), 'Ridge Regression', x, y)


def random_forest(x, y):
    """Report the CV accuracy of a class-balanced random forest."""
    # Fixed seed so the reported score does not change between reruns.
    model = RandomForestClassifier(max_depth=4, criterion='entropy',
                                   class_weight='balanced', random_state=0)
    _report_cv_accuracy(model, 'Random Forest', x, y)


def mlp(x, y):
    """Report the CV accuracy of an MLP classifier with early stopping."""
    model = MLPClassifier(random_state=0, max_iter=1000, early_stopping=True)
    _report_cv_accuracy(model, 'MLP', x, y)


def gassianNB(x, y):
    """Report the CV accuracy of a Gaussian naive Bayes classifier."""
    _report_cv_accuracy(GaussianNB(), 'GaussianNB', x, y)


# ---------------------------------------------------------------------------
# Page
# ---------------------------------------------------------------------------

df, new_df = load_data()

st.title("Heart Failure Prediction")

st.write("Cardiovascular diseases (CVDs) are ranked to have the highest death rate globally, which takes about 17.9 millions of lives annually and accounts for about 31% of all deaths worldwide. Heart failure is a common symptom of CVDs, which brings serious consequences, such as death.")

st.write("This clinical dataset is from Kaggle (https://www.kaggle.com/andrewmvd/heart-failure-clinical-data) and contains 12 features, 300 rows of data, which can be used to predict mortality by heart failure. Most CVDs can be prevented by addressing behaviral risk factors such as smoking, obesity, lack of physical, alcohol, etc. People with or with high CVDs risk need early detection, and machine learning models might be a good choice.")


# Part 1: Dataset overview
st.markdown("## 1. Heart Failure Overview")
st.write("Let's first look at the raw dataframe from the original dataset.")

st.dataframe(df.head())

st.write("To avoid any confusions, note that **time** suggests follow-up period (days) and **ejection_fraction** suggests percentage of blood leaving the heart at each contraction. ")

st.write("After clear inspection, this dataset does not have any missing values. The 12 features can be splitted into two categoreis: quantitative and categorical. I further changed the categorical variables as type: categorical.")
st.markdown("- Quantitative: **age**, **creatinine_phosphokinase**, **ejaction_fraction**, **platelets**, **serum_creatinine**, **serum_sodium**, **time**")
st.markdown("- Categorical: **anaemia**, **diabetes**, **high_blood_pressure**, **sex**, **smoking**, **DEATH_EVENT**")


description = st.checkbox('Wanna check the statistics for quantitative variables?')
if description:
    st.dataframe(df.describe().T)

# Part 2: Data Correlation
st.markdown("## 2. Corellation Visualization")

st.write("You can choose quantitative or categorical values to inspect the correclation matrix. Since we want to predict mortality, I included DEATH_EVENT in both cases.")
# Explicit keys: this widget and the one in section 3.1 share the same label.
choices = st.multiselect(
    'Quantitative or Categorical?',
    ('Quantitative', 'Categorical'),
    key='correlation_kind',
)

if 'Quantitative' in choices:
    correlation_quant(new_df)
    st.write("For quantitative variables, I used Pearson correlation. We can see that time (folllow-up period), serum_creatinine, ejaction_fraction, and are are the four most correlated factors. ")

if 'Categorical' in choices:
    correlation_categorical(new_df)
    st.write("For categorical variables, I used Cramer's V correlation. We can see that smoking and and high_blood_pressure are two most correlated factors. ")


# Part 3: Data Visualization
st.markdown("## 3. Data Visualization")

st.markdown("### 3.1 Variable Distributions")

st.write("You can choose quantitative or categorical values to see the its distribution.")
choices = st.multiselect(
    'Quantitative or Categorical?',
    ('Quantitative Variables', 'Categorical Variables'),
    key='distribution_kind',
)

if 'Quantitative Variables' in choices:
    visualization_continuous(new_df)
    st.write("For continuous variables, I seperated the data by DEATH_EVENT. We can see that for serum_sodium, ejaction_fraction, and density, the distribution are quite different: the mean are apprantly different. ")

if 'Categorical Variables' in choices:
    visualization_categorical(new_df)
    st.write("For categorical variables, we can see that there is some imbalance of data: for DEATH_EVENT, there are about 200 people dead but only 100 alive. Anaemia and diabetes relatively equat number of data. For other factors, the count between two categories are off by about half. A very interesting thing to notice is that diabetes, sex, and smoking seem to be uncorrelated to DEATH_EVENT at all.")

# Work on a copy: ``df`` is the object returned by the cached ``load_data``
# and ``to_categorical`` casts columns in place.
to_cast = return_categoricals(df, threshold=5)
df = to_categorical(to_cast, df.copy())


st.markdown("### 3.2 How does gender influence heart failure based on unhealthy habits?")

smoking = st.checkbox('Wanna see how does heart failure among gender based on smoking and blood pressure?')
if smoking:
    smoking_blood(df, modified_countplot)
    st.write("We can see that combining smoking and high blood pressure make many male die from heart failure, while not creating a significant case for female. Gender does make some interesing contrasts between some variables. ")

anemia = st.checkbox('Wanna see how does heart failure among gender based on anemia and diabetes?')
if anemia:
    diabetes_anemia(df, modified_countplot)
    st.write("We don't notice anyting much significant here, but it's interesting that having both of diabetes and anaemia for male have a higher chance of dying from heart failure than female. Gender does make some interesing contrasts between some variables. ")

# Part 4: ML models
st.markdown("## 4. Machine Learning Models")

# Select features by name rather than by the target's column position.
x = df.drop(columns=TARGET)
y = df[TARGET]

st.write("Now, we would like to explore which machine learning model can predict mortality by heart failure the best. You can choose the following ML models to inspect the performances. You can also choose different combinations of variables to compare their performances. We used cross validations for all five models and computed the average accuracy: 0.8 for traning, and 0.2 for validation. This is a simple binary classification problem. To deal with the data imbalance problem, I used upsampling method. ")

total_targets = list(x.columns)
label = st.multiselect('Select the variables that you would like to explore.', total_targets)

x = x[label]

# Display name -> evaluation function, in the order results are reported.
MODELS = {
    'KNN': knn,
    'Ridge Regression': ridge,
    'Random Forest': random_forest,
    'MLP': mlp,
    'Gaussian NB': gassianNB,
}

choices = st.multiselect('ML Model to choose: ', tuple(MODELS))

if choices and not label:
    # Cross-validation cannot run on a frame with zero feature columns.
    st.warning('Select at least one variable before evaluating a model.')
else:
    for model_name, evaluate in MODELS.items():
        if model_name in choices:
            evaluate(x, y)
+70,0,122,1,45,1,284000,1.3,136,1,1,26,1 +58,1,60,0,38,0,153000,5.8,134,1,0,26,1 +82,0,70,1,30,0,200000,1.2,132,1,1,26,1 +94,0,582,1,38,1,263358.03,1.83,134,1,0,27,1 +85,0,23,0,45,0,360000,3,132,1,0,28,1 +50,1,249,1,35,1,319000,1,128,0,0,28,1 +50,1,159,1,30,0,302000,1.2,138,0,0,29,0 +65,0,94,1,50,1,188000,1,140,1,0,29,1 +69,0,582,1,35,0,228000,3.5,134,1,0,30,1 +90,1,60,1,50,0,226000,1,134,1,0,30,1 +82,1,855,1,50,1,321000,1,145,0,0,30,1 +60,0,2656,1,30,0,305000,2.3,137,1,0,30,0 +60,0,235,1,38,0,329000,3,142,0,0,30,1 +70,0,582,0,20,1,263358.03,1.83,134,1,1,31,1 +50,0,124,1,30,1,153000,1.2,136,0,1,32,1 +70,0,571,1,45,1,185000,1.2,139,1,1,33,1 +72,0,127,1,50,1,218000,1,134,1,0,33,0 +60,1,588,1,60,0,194000,1.1,142,0,0,33,1 +50,0,582,1,38,0,310000,1.9,135,1,1,35,1 +51,0,1380,0,25,1,271000,0.9,130,1,0,38,1 +60,0,582,1,38,1,451000,0.6,138,1,1,40,1 +80,1,553,0,20,1,140000,4.4,133,1,0,41,1 +57,1,129,0,30,0,395000,1,140,0,0,42,1 +68,1,577,0,25,1,166000,1,138,1,0,43,1 +53,1,91,0,20,1,418000,1.4,139,0,0,43,1 +60,0,3964,1,62,0,263358.03,6.8,146,0,0,43,1 +70,1,69,1,50,1,351000,1,134,0,0,44,1 +60,1,260,1,38,0,255000,2.2,132,0,1,45,1 +95,1,371,0,30,0,461000,2,132,1,0,50,1 +70,1,75,0,35,0,223000,2.7,138,1,1,54,0 +60,1,607,0,40,0,216000,0.6,138,1,1,54,0 +49,0,789,0,20,1,319000,1.1,136,1,1,55,1 +72,0,364,1,20,1,254000,1.3,136,1,1,59,1 +45,0,7702,1,25,1,390000,1,139,1,0,60,1 +50,0,318,0,40,1,216000,2.3,131,0,0,60,1 +55,0,109,0,35,0,254000,1.1,139,1,1,60,0 +45,0,582,0,35,0,385000,1,145,1,0,61,1 +45,0,582,0,80,0,263358.03,1.18,137,0,0,63,0 +60,0,68,0,20,0,119000,2.9,127,1,1,64,1 +42,1,250,1,15,0,213000,1.3,136,0,0,65,1 +72,1,110,0,25,0,274000,1,140,1,1,65,1 +70,0,161,0,25,0,244000,1.2,142,0,0,66,1 +65,0,113,1,25,0,497000,1.83,135,1,0,67,1 +41,0,148,0,40,0,374000,0.8,140,1,1,68,0 +58,0,582,1,35,0,122000,0.9,139,1,1,71,0 +85,0,5882,0,35,0,243000,1,132,1,1,72,1 +65,0,224,1,50,0,149000,1.3,137,1,1,72,0 +69,0,582,0,20,0,266000,1.2,134,1,1,73,1 +60,1,47,0,20,0,204000,0.7,139,1,1,73,1 
+70,0,92,0,60,1,317000,0.8,140,0,1,74,0 +42,0,102,1,40,0,237000,1.2,140,1,0,74,0 +75,1,203,1,38,1,283000,0.6,131,1,1,74,0 +55,0,336,0,45,1,324000,0.9,140,0,0,74,0 +70,0,69,0,40,0,293000,1.7,136,0,0,75,0 +67,0,582,0,50,0,263358.03,1.18,137,1,1,76,0 +60,1,76,1,25,0,196000,2.5,132,0,0,77,1 +79,1,55,0,50,1,172000,1.8,133,1,0,78,0 +59,1,280,1,25,1,302000,1,141,0,0,78,1 +51,0,78,0,50,0,406000,0.7,140,1,0,79,0 +55,0,47,0,35,1,173000,1.1,137,1,0,79,0 +65,1,68,1,60,1,304000,0.8,140,1,0,79,0 +44,0,84,1,40,1,235000,0.7,139,1,0,79,0 +57,1,115,0,25,1,181000,1.1,144,1,0,79,0 +70,0,66,1,45,0,249000,0.8,136,1,1,80,0 +60,0,897,1,45,0,297000,1,133,1,0,80,0 +42,0,582,0,60,0,263358.03,1.18,137,0,0,82,0 +60,1,154,0,25,0,210000,1.7,135,1,0,82,1 +58,0,144,1,38,1,327000,0.7,142,0,0,83,0 +58,1,133,0,60,1,219000,1,141,1,0,83,0 +63,1,514,1,25,1,254000,1.3,134,1,0,83,0 +70,1,59,0,60,0,255000,1.1,136,0,0,85,0 +60,1,156,1,25,1,318000,1.2,137,0,0,85,0 +63,1,61,1,40,0,221000,1.1,140,0,0,86,0 +65,1,305,0,25,0,298000,1.1,141,1,0,87,0 +75,0,582,0,45,1,263358.03,1.18,137,1,0,87,0 +80,0,898,0,25,0,149000,1.1,144,1,1,87,0 +42,0,5209,0,30,0,226000,1,140,1,1,87,0 +60,0,53,0,50,1,286000,2.3,143,0,0,87,0 +72,1,328,0,30,1,621000,1.7,138,0,1,88,1 +55,0,748,0,45,0,263000,1.3,137,1,0,88,0 +45,1,1876,1,35,0,226000,0.9,138,1,0,88,0 +63,0,936,0,38,0,304000,1.1,133,1,1,88,0 +45,0,292,1,35,0,850000,1.3,142,1,1,88,0 +85,0,129,0,60,0,306000,1.2,132,1,1,90,1 +55,0,60,0,35,0,228000,1.2,135,1,1,90,0 +50,0,369,1,25,0,252000,1.6,136,1,0,90,0 +70,1,143,0,60,0,351000,1.3,137,0,0,90,1 +60,1,754,1,40,1,328000,1.2,126,1,0,91,0 +58,1,400,0,40,0,164000,1,139,0,0,91,0 +60,1,96,1,60,1,271000,0.7,136,0,0,94,0 +85,1,102,0,60,0,507000,3.2,138,0,0,94,0 +65,1,113,1,60,1,203000,0.9,140,0,0,94,0 +86,0,582,0,38,0,263358.03,1.83,134,0,0,95,1 +60,1,737,0,60,1,210000,1.5,135,1,1,95,0 +66,1,68,1,38,1,162000,1,136,0,0,95,0 +60,0,96,1,38,0,228000,0.75,140,0,0,95,0 +60,1,582,0,30,1,127000,0.9,145,0,0,95,0 +60,0,582,0,40,0,217000,3.7,134,1,0,96,1 
+43,1,358,0,50,0,237000,1.3,135,0,0,97,0 +46,0,168,1,17,1,271000,2.1,124,0,0,100,1 +58,1,200,1,60,0,300000,0.8,137,0,0,104,0 +61,0,248,0,30,1,267000,0.7,136,1,1,104,0 +53,1,270,1,35,0,227000,3.4,145,1,0,105,0 +53,1,1808,0,60,1,249000,0.7,138,1,1,106,0 +60,1,1082,1,45,0,250000,6.1,131,1,0,107,0 +46,0,719,0,40,1,263358.03,1.18,137,0,0,107,0 +63,0,193,0,60,1,295000,1.3,145,1,1,107,0 +81,0,4540,0,35,0,231000,1.18,137,1,1,107,0 +75,0,582,0,40,0,263358.03,1.18,137,1,0,107,0 +65,1,59,1,60,0,172000,0.9,137,0,0,107,0 +68,1,646,0,25,0,305000,2.1,130,1,0,108,0 +62,0,281,1,35,0,221000,1,136,0,0,108,0 +50,0,1548,0,30,1,211000,0.8,138,1,0,108,0 +80,0,805,0,38,0,263358.03,1.1,134,1,0,109,1 +46,1,291,0,35,0,348000,0.9,140,0,0,109,0 +50,0,482,1,30,0,329000,0.9,132,0,0,109,0 +61,1,84,0,40,1,229000,0.9,141,0,0,110,0 +72,1,943,0,25,1,338000,1.7,139,1,1,111,1 +50,0,185,0,30,0,266000,0.7,141,1,1,112,0 +52,0,132,0,30,0,218000,0.7,136,1,1,112,0 +64,0,1610,0,60,0,242000,1,137,1,0,113,0 +75,1,582,0,30,0,225000,1.83,134,1,0,113,1 +60,0,2261,0,35,1,228000,0.9,136,1,0,115,0 +72,0,233,0,45,1,235000,2.5,135,0,0,115,1 +62,0,30,1,60,1,244000,0.9,139,1,0,117,0 +50,0,115,0,45,1,184000,0.9,134,1,1,118,0 +50,0,1846,1,35,0,263358.03,1.18,137,1,1,119,0 +65,1,335,0,35,1,235000,0.8,136,0,0,120,0 +60,1,231,1,25,0,194000,1.7,140,1,0,120,0 +52,1,58,0,35,0,277000,1.4,136,0,0,120,0 +50,0,250,0,25,0,262000,1,136,1,1,120,0 +85,1,910,0,50,0,235000,1.3,134,1,0,121,0 +59,1,129,0,45,1,362000,1.1,139,1,1,121,0 +66,1,72,0,40,1,242000,1.2,134,1,0,121,0 +45,1,130,0,35,0,174000,0.8,139,1,1,121,0 +63,1,582,0,40,0,448000,0.9,137,1,1,123,0 +50,1,2334,1,35,0,75000,0.9,142,0,0,126,1 +45,0,2442,1,30,0,334000,1.1,139,1,0,129,1 +80,0,776,1,38,1,192000,1.3,135,0,0,130,1 +53,0,196,0,60,0,220000,0.7,133,1,1,134,0 +59,0,66,1,20,0,70000,2.4,134,1,0,135,1 +65,0,582,1,40,0,270000,1,138,0,0,140,0 +70,0,835,0,35,1,305000,0.8,133,0,0,145,0 +51,1,582,1,35,0,263358.03,1.5,136,1,1,145,0 +52,0,3966,0,40,0,325000,0.9,140,1,1,146,0 
+70,1,171,0,60,1,176000,1.1,145,1,1,146,0 +50,1,115,0,20,0,189000,0.8,139,1,0,146,0 +65,0,198,1,35,1,281000,0.9,137,1,1,146,0 +60,1,95,0,60,0,337000,1,138,1,1,146,0 +69,0,1419,0,40,0,105000,1,135,1,1,147,0 +49,1,69,0,50,0,132000,1,140,0,0,147,0 +63,1,122,1,60,0,267000,1.2,145,1,0,147,0 +55,0,835,0,40,0,279000,0.7,140,1,1,147,0 +40,0,478,1,30,0,303000,0.9,136,1,0,148,0 +59,1,176,1,25,0,221000,1,136,1,1,150,1 +65,0,395,1,25,0,265000,1.2,136,1,1,154,1 +75,0,99,0,38,1,224000,2.5,134,1,0,162,1 +58,1,145,0,25,0,219000,1.2,137,1,1,170,1 +60.667,1,104,1,30,0,389000,1.5,136,1,0,171,1 +50,0,582,0,50,0,153000,0.6,134,0,0,172,1 +60,0,1896,1,25,0,365000,2.1,144,0,0,172,1 +60.667,1,151,1,40,1,201000,1,136,0,0,172,0 +40,0,244,0,45,1,275000,0.9,140,0,0,174,0 +80,0,582,1,35,0,350000,2.1,134,1,0,174,0 +64,1,62,0,60,0,309000,1.5,135,0,0,174,0 +50,1,121,1,40,0,260000,0.7,130,1,0,175,0 +73,1,231,1,30,0,160000,1.18,142,1,1,180,0 +45,0,582,0,20,1,126000,1.6,135,1,0,180,1 +77,1,418,0,45,0,223000,1.8,145,1,0,180,1 +45,0,582,1,38,1,263358.03,1.18,137,0,0,185,0 +65,0,167,0,30,0,259000,0.8,138,0,0,186,0 +50,1,582,1,20,1,279000,1,134,0,0,186,0 +60,0,1211,1,35,0,263358.03,1.8,113,1,1,186,0 +63,1,1767,0,45,0,73000,0.7,137,1,0,186,0 +45,0,308,1,60,1,377000,1,136,1,0,186,0 +70,0,97,0,60,1,220000,0.9,138,1,0,186,0 +60,0,59,0,25,1,212000,3.5,136,1,1,187,0 +78,1,64,0,40,0,277000,0.7,137,1,1,187,0 +50,1,167,1,45,0,362000,1,136,0,0,187,0 +40,1,101,0,40,0,226000,0.8,141,0,0,187,0 +85,0,212,0,38,0,186000,0.9,136,1,0,187,0 +60,1,2281,1,40,0,283000,1,141,0,0,187,0 +49,0,972,1,35,1,268000,0.8,130,0,0,187,0 +70,0,212,1,17,1,389000,1,136,1,1,188,0 +50,0,582,0,62,1,147000,0.8,140,1,1,192,0 +78,0,224,0,50,0,481000,1.4,138,1,1,192,0 +48,1,131,1,30,1,244000,1.6,130,0,0,193,1 +65,1,135,0,35,1,290000,0.8,134,1,0,194,0 +73,0,582,0,35,1,203000,1.3,134,1,0,195,0 +70,0,1202,0,50,1,358000,0.9,141,0,0,196,0 +54,1,427,0,70,1,151000,9,137,0,0,196,1 +68,1,1021,1,35,0,271000,1.1,134,1,0,197,0 
+55,0,582,1,35,1,371000,0.7,140,0,0,197,0 +73,0,582,0,20,0,263358.03,1.83,134,1,0,198,1 +65,0,118,0,50,0,194000,1.1,145,1,1,200,0 +42,1,86,0,35,0,365000,1.1,139,1,1,201,0 +47,0,582,0,25,0,130000,0.8,134,1,0,201,0 +58,0,582,1,25,0,504000,1,138,1,0,205,0 +75,0,675,1,60,0,265000,1.4,125,0,0,205,0 +58,1,57,0,25,0,189000,1.3,132,1,1,205,0 +55,1,2794,0,35,1,141000,1,140,1,0,206,0 +65,0,56,0,25,0,237000,5,130,0,0,207,0 +72,0,211,0,25,0,274000,1.2,134,0,0,207,0 +60,0,166,0,30,0,62000,1.7,127,0,0,207,1 +70,0,93,0,35,0,185000,1.1,134,1,1,208,0 +40,1,129,0,35,0,255000,0.9,137,1,0,209,0 +53,1,707,0,38,0,330000,1.4,137,1,1,209,0 +53,1,582,0,45,0,305000,1.1,137,1,1,209,0 +77,1,109,0,50,1,406000,1.1,137,1,0,209,0 +75,0,119,0,50,1,248000,1.1,148,1,0,209,0 +70,0,232,0,30,0,173000,1.2,132,1,0,210,0 +65,1,720,1,40,0,257000,1,136,0,0,210,0 +55,1,180,0,45,0,263358.03,1.18,137,1,1,211,0 +70,0,81,1,35,1,533000,1.3,139,0,0,212,0 +65,0,582,1,30,0,249000,1.3,136,1,1,212,0 +40,0,90,0,35,0,255000,1.1,136,1,1,212,0 +73,1,1185,0,40,1,220000,0.9,141,0,0,213,0 +54,0,582,1,38,0,264000,1.8,134,1,0,213,0 +61,1,80,1,38,0,282000,1.4,137,1,0,213,0 +55,0,2017,0,25,0,314000,1.1,138,1,0,214,1 +64,0,143,0,25,0,246000,2.4,135,1,0,214,0 +40,0,624,0,35,0,301000,1,142,1,1,214,0 +53,0,207,1,40,0,223000,1.2,130,0,0,214,0 +50,0,2522,0,30,1,404000,0.5,139,0,0,214,0 +55,0,572,1,35,0,231000,0.8,143,0,0,215,0 +50,0,245,0,45,1,274000,1,133,1,0,215,0 +70,0,88,1,35,1,236000,1.2,132,0,0,215,0 +53,1,446,0,60,1,263358.03,1,139,1,0,215,0 +52,1,191,1,30,1,334000,1,142,1,1,216,0 +65,0,326,0,38,0,294000,1.7,139,0,0,220,0 +58,0,132,1,38,1,253000,1,139,1,0,230,0 +45,1,66,1,25,0,233000,0.8,135,1,0,230,0 +53,0,56,0,50,0,308000,0.7,135,1,1,231,0 +55,0,66,0,40,0,203000,1,138,1,0,233,0 +62,1,655,0,40,0,283000,0.7,133,0,0,233,0 +65,1,258,1,25,0,198000,1.4,129,1,0,235,1 +68,1,157,1,60,0,208000,1,140,0,0,237,0 +61,0,582,1,38,0,147000,1.2,141,1,0,237,0 +50,1,298,0,35,0,362000,0.9,140,1,1,240,0 
+55,0,1199,0,20,0,263358.03,1.83,134,1,1,241,1 +56,1,135,1,38,0,133000,1.7,140,1,0,244,0 +45,0,582,1,38,0,302000,0.9,140,0,0,244,0 +40,0,582,1,35,0,222000,1,132,1,0,244,0 +44,0,582,1,30,1,263358.03,1.6,130,1,1,244,0 +51,0,582,1,40,0,221000,0.9,134,0,0,244,0 +67,0,213,0,38,0,215000,1.2,133,0,0,245,0 +42,0,64,0,40,0,189000,0.7,140,1,0,245,0 +60,1,257,1,30,0,150000,1,137,1,1,245,0 +45,0,582,0,38,1,422000,0.8,137,0,0,245,0 +70,0,618,0,35,0,327000,1.1,142,0,0,245,0 +70,0,582,1,38,0,25100,1.1,140,1,0,246,0 +50,1,1051,1,30,0,232000,0.7,136,0,0,246,0 +55,0,84,1,38,0,451000,1.3,136,0,0,246,0 +70,0,2695,1,40,0,241000,1,137,1,0,247,0 +70,0,582,0,40,0,51000,2.7,136,1,1,250,0 +42,0,64,0,30,0,215000,3.8,128,1,1,250,0 +65,0,1688,0,38,0,263358.03,1.1,138,1,1,250,0 +50,1,54,0,40,0,279000,0.8,141,1,0,250,0 +55,1,170,1,40,0,336000,1.2,135,1,0,250,0 +60,0,253,0,35,0,279000,1.7,140,1,0,250,0 +45,0,582,1,55,0,543000,1,132,0,0,250,0 +65,0,892,1,35,0,263358.03,1.1,142,0,0,256,0 +90,1,337,0,38,0,390000,0.9,144,0,0,256,0 +45,0,615,1,55,0,222000,0.8,141,0,0,257,0 +60,0,320,0,35,0,133000,1.4,139,1,0,258,0 +52,0,190,1,38,0,382000,1,140,1,1,258,0 +63,1,103,1,35,0,179000,0.9,136,1,1,270,0 +62,0,61,1,38,1,155000,1.1,143,1,1,270,0 +55,0,1820,0,38,0,270000,1.2,139,0,0,271,0 +45,0,2060,1,60,0,742000,0.8,138,0,0,278,0 +45,0,2413,0,38,0,140000,1.4,140,1,1,280,0 +50,0,196,0,45,0,395000,1.6,136,1,1,285,0 diff --git a/screenshot.pdf b/screenshot.pdf new file mode 100644 index 0000000..c5b4342 Binary files /dev/null and b/screenshot.pdf differ diff --git a/writeup.md b/writeup.md new file mode 100644 index 0000000..9be09b9 --- /dev/null +++ b/writeup.md @@ -0,0 +1,23 @@ + +## Project Goals + +In the project, I would like to let the user to explore how different factors, including biological features, other existing symotoms, and behavioural habbits, to better predict heart failure. 
+ + +## Design + +1) In section 1, I give an overview of the whole dataset by showing the first five rows of data as a dataframe, to give the user a sense of what the data looks like. I also categorize the data into categorical and quantitative variables, and let the user choose whether to check the statistics of the quantitative data. The reason I split the data into categorical and quantitative is that many further data transformations and analytical approaches differ for those two categories, so I split them at the beginning. + +2) In section 2, I explore the correlations among all variables, splitting them into categorical and quantitative variables so that the correlations within each group can be inspected. I designed the multiselect button to let the user select which kind of variables to inspect. + +3) In section 3.1, I let the user choose to visualize the distribution of each variable, again splitting the variables into categorical and quantitative. The reason I didn't let the user choose every single variable is because I think it is better to let the user compare distributions across variables. + +4) In section 3.2, from the previous visualizations, I found it very interesting that gender seems not to be correlated with the symptoms at all, so I investigate how heart failure differs by gender based on smoking and blood pressure, and on anaemia and diabetes. I let the user choose whether he/she wants to investigate this further. + +5) In section 4, based on the statistics/distributions/visualizations I provided before, I let the user pick which variables seem to be the most important in predicting heart failure, and let the user choose between five popular machine learning models. They can see the accuracies, try different combinations of features, and explore different models to validate their intuitions. + + + +## Develop + +I made the whole app by myself, spending about 20 hours in total.
The design part is the most difficult: how to make the interaction more fascinating and engaging? \ No newline at end of file