IDSF21 · github-classroom · Oct 17, 2021 · Oct 20, 2021 · Oct 20, 2021 · Oct 20, 2021
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 lalopark
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,15 @@
+# 1DS_HW2
+
+## A clear description of the goals of your project. 
+The goal of our project was to dive into an HR Analytics dataset from Kaggle and uncover trends in the data scientist hiring process by analyzing the candidate pool in terms of various demographic and professional characteristics. As students and current practitioners of data science, we are always intrigued by questions such as “What makes a good candidate for a data scientist?” and “Where do I stand in the job market in this region for this year?” Our motivation was to dissect the anonymized dataset with as much detail as possible from gender, education level, city, and years of experience to understand which candidates may be looking for jobs, whether it’s their very first foot in the door or a second/third position as a Data Scientist.  
+
+## A rationale for your design decisions. 
+We decided to include 
+1) A navigation menu that lets users easily toggle between the intro, the write-up, the different data analyses, both macro and micro (gender- and education-specific), and our own contact information in case they wish to get in touch with us for further inquiry.
+2) A user form where we predict the visitor’s likelihood of looking for a new job as a Data Scientist based on the provided dataset. It engages users and encourages them not only to understand the general summary statistics of the dataset but also to see how it can be applied to their current employment/job search situation. After the user types in their name, we thank them with a green popup (“Thanks, [user’s name]!”) to motivate them to provide further information. We made the years-of-experience response a range slider and the rest dropdown-style inputs, so that we receive data points that are as clean as possible. We also made sure to add nuance: in cases where it’s difficult to make an accurate prediction, we surface the message “we’ll need more information from you!” to inform users of the limitations of the anonymized dataset. The alternative solution we considered was creating a separate tab for this, but we ultimately decided to keep it as a sidebar that follows the user across different tabs, to keep the app dynamic and fun.
+3) Box plots, bar graphs, pie graphs, scatter plots, and snapshots of the raw dataset, along with interactive drop-down filters for gender and education: once the user selects a specific gender/education level, they are able to drill further down into the visualizations. We hypothesized that gender and education biases play a significant role in HR analytics, especially in tech and in the data science function, and that our visitors would be curious to learn these insights.
+
+## An overview of your development process.
+Each of us created more than six graphs. Before choosing a dataset, we discussed our common interests and found this one on Kaggle. We met three times over two weeks and spent about 10 hours finishing this assignment. Since neither of us had experience developing apps or using Streamlit, we first had to learn how to code in Streamlit. Then, we brainstormed and planned out how to clean and analyze the dataset. After creating the graphs, debugging took the most time: we had to figure out what caused each error by searching for the error message online.
+
+[App URL](https://share.streamlit.io/lalopark/1ds_hw2/main/app/app.py)
diff --git a/app/app.py b/app/app.py
@@ -0,0 +1,84 @@
+#app.py
+import app1
+import app2
+import app3
+import app4
+import app5
+import pandas as pd
+
+import streamlit as st
+
+
+# Maps the label shown in the navigation dropdown to the module whose
+# ``app()`` function renders that page.
+PAGES = {
+    "About the Dataset": app1,
+    "Analysis1": app3,
+    "Analysis2": app2,
+    "Writeup": app4,
+    'Developer Contact': app5,
+}
+
+st.sidebar.title('HW2 for 05839 CMU (Interactive Data Science): Page Navigation')
+selection = st.sidebar.selectbox("Go to", list(PAGES.keys()))
+page = PAGES[selection]
+page.app()
+
+# ---------------------------------------------------------------------------
+# Sidebar "prediction" form
+# ---------------------------------------------------------------------------
+# Normalise the raw labels so they line up with the choices offered below.
+train = pd.read_csv('app/aug_train.csv')
+train = train.replace({
+    'company_size': {'10/49': '10-49', '<10': '1-9', '100-500': '100-499'},
+    'education_level': {'Graduate': 'Undergraduate'},
+    'relevent_experience': {'Has relevent experience': 'Yes',
+                            'No relevent experience': 'No'},
+})
+# Missing gender is imputed as 'Female'; every other gap becomes the literal
+# 'other', which is also offered as a "previous job" choice in the form.
+train = train.fillna(value={'gender': 'Female'})
+train = train.fillna('other')
+
+# Mean of ``target`` for every combination of the six form answers.
+FEATURES = ['gender', 'relevent_experience', 'education_level',
+            'major_discipline', 'experience', 'last_new_job']
+group = train.groupby(FEATURES)['target'].mean().reset_index()
+
+# The form offers a more inclusive label than the one stored in the data.
+GENDER_LABELS = {'Other/Non-binary': 'Other'}
+
+
+def _experience_label(years):
+    """Translate whole years from the slider into the dataset's label.
+
+    Only values strictly below 1 become '<1' and only values strictly above
+    20 become '>20'; 1 and 20 themselves keep their own labels.
+    """
+    if years < 1:
+        return '<1'
+    if years > 20:
+        return '>20'
+    return str(years)
+
+
+# NOTE(review): some widgets are created through ``st.`` and others through
+# ``st.sidebar.``; the calls are kept exactly as they were because this may
+# affect which widgets Streamlit treats as part of the form -- confirm before
+# unifying them.
+with st.form(key='Form1'):
+    with st.sidebar:
+        name = st.text_input("What is your name?")
+        if name != '':
+            st.sidebar.success('Thanks, ' + name + '!')
+
+        rel = st.radio('Do you have previous relevant experience as a Data Scientist?',
+                       ('Yes', 'No'))
+
+        experience = _experience_label(
+            int(st.sidebar.slider('How many years of experience do you have?', 0, 30)))
+        last_job = st.sidebar.selectbox("How many years did you stay at your previous job?",
+                                        ['1', '2', '3', '4', '>4', 'other', 'never'])
+        edu = st.sidebar.selectbox("What is the your highest degree achieved?",
+                                   ['Phd', 'Masters', 'Undergraduate', 'High School', 'Primary School'])
+
+        major = st.sidebar.selectbox("Which discipline did you major in?",
+                                     ['Arts', 'Business Degree', 'Humanities', 'STEM', 'No Major', 'Other'])
+
+        gen = st.sidebar.radio('Last but not least, what is your gender?',
+                               ('Male', 'Female', 'Other/Non-binary'))
+        submitted1 = st.form_submit_button(label='Predict')
+
+# Look up the group that matches every answer given in the form.
+answers = {
+    'gender': GENDER_LABELS.get(gen, gen),
+    'relevent_experience': rel,
+    'education_level': edu,
+    'major_discipline': major,
+    'experience': experience,
+    'last_new_job': last_job,
+}
+matches = pd.Series(True, index=group.index)
+for column, answer in answers.items():
+    matches &= group[column] == str(answer)
+val = group.loc[matches, 'target']
+
+if val.empty:
+    # No candidate in the dataset shares this exact combination of answers.
+    st.sidebar.success('We’ll need more information to guess your next move!')
+else:
+    # The grouping keys are unique, so at most one row can match.
+    probability = round(float(val.iloc[0]) * 100, 3)
+    st.sidebar.success('Your probability of looking for a new data scientist job is '
+                       + str(probability) + '%.')
+
+
diff --git a/app/app1.py b/app/app1.py
@@ -0,0 +1,47 @@
+# app1.py
+import streamlit as st
+import pandas as pd
+
+def app():
+    """Render the landing page.
+
+    Shows, in order: the header image, the project context, a description of
+    the dataset, a data dictionary (one row per column of the CSV) and the
+    raw training data loaded from ``app/aug_train.csv``.
+    """
+    st.image('app/data_scientist.png', use_column_width=True)
+
+    st.title('Context')
+    st.write(
+        'The demand for data scientists has been steadily increasing as the job title is often '
+        'referenced as “the sexiest job of the 21st century.” '
+        'As students of data science, we were interested in diving into the HR analytics '
+        'involving data science practitioners, hence obtained a dataset from '
+        '[Kaggle](https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists) '
+        'that includes the demographic, education, and experience information of data scientist '
+        'candidates for a company. '
+    )
+
+    st.title('About the Dataset')
+    st.write(
+        'This dataset includes 14 features of 19093 data scientists such as demographics, '
+        'education, and experience level and is divided into test and train, the latter '
+        'containing an additional column of “whether the candidate is currently looking for a '
+        'job” which we’ve utilized to create an interactive sidebar feature to collect the '
+        'user’s input responses to predict their likelihood of looking for a new job at the '
+        'moment, based on the provided data set. We’d like to inform the users and our site '
+        'visitors that the provided dataset is quite imbalanced on several demographical '
+        'facets, hence in cases when the probability cannot be estimated, we’ve made sure to '
+        'output a qualifying statement: “we’ll need more information to guess your next move!”'
+    )
+
+    st.title('What the Dataset Looks Like')
+    # Data dictionary: the two lists are parallel, one entry per CSV column.
+    details = {
+        'Column': ['enrollee_id', 'city', 'city_development_index', 'gender',
+                   'relevent_experience', 'enrolled_university', 'education_level',
+                   'major_discipline', 'experience', 'company_size', 'company_type',
+                   'last_new_job', 'training_hours', 'target'],
+        'Description': ['Unique ID for candidate',
+                        'City code',
+                        'Development index of the city (scaled)',
+                        'Gender of candidate',
+                        'Relevant experience of candidate',
+                        'Type of University course enrolled if any',
+                        'Education level of candidate',
+                        'Education major discipline of candidate',
+                        'Candidate total experience in years',
+                        "Number of employees in current employer's company",
+                        'Type of current employer',
+                        'Difference in years between previous job and current job',
+                        'Completed training hours',
+                        'Whether the candidate is currently looking for a job'],
+    }
+    # The dictionary used to be built and then dropped; show it to the visitor.
+    st.table(pd.DataFrame(details))
+
+    train = pd.read_csv('app/aug_train.csv')
+    st.write(train)
+
+
+
diff --git a/app/app2.py b/app/app2.py
@@ -0,0 +1,120 @@
+# app2.py
+
+import numpy as np
+import pandas as pd
+import altair as alt
+import plotly.express as px
+import plotly.graph_objs as go
+import pickle as pkle
+import os.path
+import streamlit as st
+
+def _to_years(column):
+    """Parse a column of year labels into numbers.
+
+    Every non-alphanumeric character is stripped before parsing, so labels
+    such as '<1' or '>4' collapse onto 1 and 4. Anything that still is not a
+    number afterwards (textual labels, missing values) becomes NaN.
+    """
+    stripped = column.astype(str).map(
+        lambda label: ''.join(ch for ch in label if ch.isalnum()))
+    return pd.to_numeric(stripped, errors='coerce')
+
+
+def _count_rows(frame, keys):
+    """Return one row per combination of ``keys`` with a ``Count`` column.
+
+    ``Count`` is the number of non-null ``enrollee_id`` values in the group.
+    """
+    counts = frame.groupby(keys).count().reset_index()
+    return counts[keys + ['enrollee_id']].rename(columns={'enrollee_id': 'Count'})
+
+
+def _bar_chart(data, title, x, **encodings):
+    """Build the 600x400 interactive bar chart of ``Count`` used on this page."""
+    chart = alt.Chart(data, title=title).mark_bar().encode(x=x, y="Count", **encodings)
+    return chart.properties(width=600, height=400).configure_axisX(labelAngle=45).interactive()
+
+
+def app():
+    """Render the 'Analysis2' page: gender overview plus filterable bar charts."""
+    st.title('Analysis2')
+
+    # The CSV is read exactly once. It used to be re-read after cleaning,
+    # which silently threw the normalised labels away, so the company-size
+    # ordering and the 'Undergraduate' filter below never matched the data.
+    train = pd.read_csv('app/aug_train.csv')
+    train = train.replace({
+        'company_size': {'10/49': '10-49', '<10': '1-9', '100-500': '100-499'},
+        'education_level': {'Graduate': 'Undergraduate'},
+    })
+    train['experience'] = _to_years(train['experience'])
+    train['last_new_job'] = _to_years(train['last_new_job'])
+
+    # Rows with a numeric "last new job"; copied so the edits below do not
+    # write into a view of ``train``.
+    df2 = train[train['last_new_job'].notna()].copy()
+    df2['last_new_job'] = df2['last_new_job'].astype(int)
+    # NOTE(review): 'Other' and missing genders are both folded into 'Female'
+    # for the two overview charts -- confirm this is the intended treatment.
+    df2 = df2.replace({'gender': {'Other': 'Female'}})
+    df2 = df2.fillna(value={'gender': 'Female'})
+
+    imbalance = df2.groupby('gender').count().reset_index()
+    imbalance = imbalance.rename(columns={'enrollee_id': 'Count', 'gender': 'Gender'})
+    st.altair_chart(_bar_chart(imbalance, 'Gender Imbalanced Data', "Gender"))
+
+    # Bubble size is the per-group count of non-null ``major_discipline``.
+    final = df2.groupby(['gender', 'last_new_job', 'experience']).count().reset_index()
+    fig = px.scatter(
+        final, x='experience', y='last_new_job', color='gender', size='major_discipline',
+        labels={'experience': 'Experience (years)',
+                'last_new_job': 'Last new job (years)',
+                'gender': 'Gender',
+                'major_discipline': 'Count'})
+    st.plotly_chart(fig)
+
+    # Interactive filters; every option is selected by default.
+    gender_options = ['Female', 'Male']
+    selected_gender = st.multiselect('Gender', gender_options, gender_options)
+
+    # 'Masters' (not 'Master') is the label stored in the dataset.
+    education_options = ['Phd', 'Masters', 'Undergraduate', 'High School']
+    selected_education = st.multiselect('Education_level', education_options, education_options)
+
+    df_selected = train[train.gender.isin(selected_gender)
+                        & train.education_level.isin(selected_education)]
+
+    st.header('Take a closer look at the dataset based on gender and education levels. Try clicking the filters above!')
+    st.write('Data Dimension: ' + str(df_selected.shape[0]) + ' rows and ' + str(df_selected.shape[1]) + ' columns.')
+
+    # Enrolled-university bar graph
+    university_df = _count_rows(df_selected, ['enrolled_university'])
+    st.altair_chart(_bar_chart(university_df, 'Enrolled University Type', "enrolled_university"))
+
+    # major_discipline bar graph
+    major_df = _count_rows(df_selected, ['major_discipline'])
+    st.altair_chart(_bar_chart(major_df, 'Major', "major_discipline"))
+
+    # Company size, stacked by company type and ordered from small to large.
+    company_size_type = _count_rows(df_selected, ['company_size', 'company_type'])
+    categoryNames = ['1-9', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+']
+    st.altair_chart(_bar_chart(company_size_type, 'Company Size',
+                               alt.X("company_size", sort=categoryNames),
+                               color='company_type'))
+
+
+
+
+
+
+
diff --git a/app/app3.py b/app/app3.py
@@ -0,0 +1,74 @@
+# app3.py
+
+import numpy as np
+import pandas as pd
+import altair as alt
+import plotly.express as px
+import plotly.graph_objs as go
+import pickle as pkle
+import os.path
+import streamlit as st 
+import base64
+import seaborn as sns
+import plotly.express as px
+import re
+
+
+
+
+def _to_years(column):
+    """Parse a column of year labels into numbers.
+
+    Every non-alphanumeric character is stripped before parsing, so labels
+    such as '<1' or '>4' collapse onto 1 and 4. Anything that still is not a
+    number afterwards (textual labels, missing values) becomes NaN.
+    """
+    stripped = column.astype(str).map(
+        lambda label: ''.join(ch for ch in label if ch.isalnum()))
+    return pd.to_numeric(stripped, errors='coerce')
+
+
+def app():
+    """Render the 'Analysis1' page: distribution charts for the candidate pool."""
+    st.title('Analysis1')
+    train = pd.read_csv('app/aug_train.csv')
+
+    # Relabel once, up front, so the pie chart and the box plot use the same
+    # education categories (previously only the pie chart was relabelled).
+    train = train.replace({'education_level': {'Graduate': 'Undergraduate'}})
+    train['experience'] = _to_years(train['experience'])
+    train['last_new_job'] = _to_years(train['last_new_job'])
+
+    # Copies, so the integer casts do not write into a view of ``train``.
+    df = train[train['experience'].notna()].copy()
+    df['experience'] = df['experience'].astype(int)
+    df2 = train[train['last_new_job'].notna()].copy()
+    df2['last_new_job'] = df2['last_new_job'].astype(int)
+
+    fig1 = px.histogram(train, x='city_development_index', nbins=50,
+                        title='City Development Index Distribution of Candidates')
+    st.plotly_chart(fig1)
+
+    fig5 = px.pie(df, names='education_level', title='Education Levels of Candidates')
+    st.plotly_chart(fig5)
+
+    # Plot straight from the frame; melting it first only renamed the y axis
+    # to the anonymous label 'value'.
+    box2 = px.box(train, x="education_level", y="experience",
+                  title='Experience Distribution by Education Level')
+    st.plotly_chart(box2)
+
+    box3 = px.box(train, x="major_discipline", y="experience",
+                  title='Experience Distribution by Major')
+    st.plotly_chart(box3)
+
+    fig2 = px.histogram(train, x='training_hours', nbins=50,
+                        title='Training Hour Distribution of Candidates')
+    st.plotly_chart(fig2)
+
+    # Number of candidates per (training hours, relevant experience) pair.
+    # The count column is named at creation instead of by writing into
+    # ``columns.values``, which is not a supported way to rename a column.
+    group = (train.groupby(['training_hours', 'relevent_experience'])
+             .size()
+             .reset_index(name='count'))
+    fig6 = px.line(group, x="training_hours", y="count", color="relevent_experience",
+                   title='Density Curve of Training Hours For Candidates With Relevant Experience vs Those Without')
+    st.plotly_chart(fig6)
+
+    fig3 = px.histogram(df, x='experience', nbins=50, title='Experience Distribution of Candidates')
+    st.plotly_chart(fig3)
+
+    fig4 = px.histogram(df2, x='last_new_job', nbins=5, title='Last New Job Distribution of Candidates')
+    st.plotly_chart(fig4)
+
+
+