page updates

jodiambra · May 13, 2023 · 27b4f47 · 27b4f47
1 parent 6b97ecc
commit 27b4f47
Show file tree

Hide file tree

Showing 7 changed files with 199 additions and 110 deletions.
diff --git a/Home.py b/Home.py
@@ -27,6 +27,7 @@
 
 
 st.title('Yachay.ai')
+"---"
 st.header('Text based geolocation prediction')
 
 # image1 = Image.open('images/tweet.png')

diff --git a/pages/1_EDA.py b/pages/1_EDA.py
@@ -0,0 +1,116 @@
+#import packages
+import numpy as np
+from numpy import genfromtxt
+import streamlit as st
+import pandas as pd
+import plotly_express as px
+from PIL import Image
+from streamlit.commands.page_config import Layout
+from sklearn.model_selection import train_test_split
+import tensorflow as tf 
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
+from sklearn.compose import make_column_transformer
+import seaborn as sns
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+from keras.optimizers import Adam
+import torch
+import transformers
+from tqdm.auto import tqdm
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+import math
+from sklearn.metrics.pairwise import haversine_distances
+from math import radians
+from streamlit.components.v1 import components
+import io 
+
+#----------------------------#
+# Upgrade streamlit library
+# pip install --upgrade streamlit
+
+#-----------------------------#
+# Page layout
+icon = Image.open('images/geo.ico')
+st.set_page_config(page_title='Yachay.ai Externship',
+                   page_icon=icon,
+                   layout='wide',
+                   initial_sidebar_state="auto",
+                   menu_items=None)
+
+st.title('Exploratory Data Analysis')
+"---"
+
+# read dataset
+df_main = pd.read_csv('data/Main_Dataset.csv',
+                      parse_dates=['timestamp'], index_col=['timestamp'])
+
+# sort by timestamp
+df_main.sort_index(inplace=True)
+
+# load cluster data
+df_cl = pd.read_csv('data/Clusters_Coordinates.csv')
+
+# Derive calendar/time feature columns from the frame's timestamp index
+
+
+def make_features(data):  # adds the columns below to `data` in place; nothing is returned
+    data['year'] = data.index.year
+    data['month'] = data.index.month
+    data['week'] = data.index.isocalendar().week  # ISO-calendar week number
+    data['day'] = data.index.day
+    data['day_of_week'] = data.index.day_of_week  # pandas convention: Monday=0 ... Sunday=6
+    data['day_of_year'] = data.index.day_of_year
+    data['hour'] = data.index.hour
+    data['minute'] = data.index.minute
+    data['second'] = data.index.second
+
+
+make_features(df_main)
+
+
+# merge main and cluster coordinates
+df = df_main.merge(df_cl, on='cluster_id', sort=True)
+
+# drop missing values
+df.dropna(inplace=True)
+
+
+#------------------------------------------#
+
+# EDA section
+st.title('')
+st.subheader('Exploratory Data Analysis')
+columns = st.selectbox('Select Column', [
+                       'cluster_id', 'month', 'week', 'day', 'day_of_week', 'day_of_year', 'hour', 'minute', 'second', 'user_id'])
+
+@st.cache_data  # Streamlit memoizes the returned figure for repeated (df, columns) inputs
+def plot_hist(df, columns):  # NOTE: `columns` is one column name (str), despite the plural
+    fig = px.histogram(df[columns], title='Distribution of ' + str.upper(columns).replace(  # title: upper-cased name, '_' -> ' '
+        '_', ' '), labels={'value': str(columns).replace('_', ' ')}, height=800, width=1200)  # 'value' axis relabelled with the readable name
+    return fig  # Plotly figure; the caller renders it with st.plotly_chart
+
+
+st.plotly_chart(plot_hist(df, columns), use_container_width=True)
+
+"---"
+#--------------------#
+# Tweets dataframe
+st.title('')
+st.header('Tweets')
+
+# looking through tweets
+number = st.slider('Select Number of Tweets', 1, 1000, 10)
+
+@st.cache_data  # result is memoized by Streamlit for repeated (df, number) inputs
+def tweet_lists(df, number):  # returns the first `number` entries of df.text as a plain list
+    tweets = df.text.tolist()  # the whole `text` column is materialised before slicing
+    dataframe = tweets[:number]  # NOTE: this is a list of strings, not a DataFrame, despite the name
+    return dataframe
+
+tweets = tweet_lists(df, number)
+st.dataframe(tweets, width=1500)
+
+"---"
+
diff --git a/pages/1_Maps.py → pages/2_Maps.py b/pages/1_Maps.py → pages/2_Maps.py
diff --git a/pages/2_Externship Data.py → pages/3_Modeling.py b/pages/2_Externship Data.py → pages/3_Modeling.py
@@ -26,6 +26,7 @@
 from streamlit.components.v1 import components
 import io 
 
+
 #----------------------------#
 # Upgrade streamlit library
 # pip install --upgrade streamlit
@@ -39,78 +40,7 @@
                    initial_sidebar_state="auto",
                    menu_items=None)
 
-st.title('Yachay.ai')
-st.subheader('Tweet Geolocation Prediction')
-
-# read dataset
-df_main = pd.read_csv('data/Main_Dataset.csv',
-                      parse_dates=['timestamp'], index_col=['timestamp'])
-
-# sort by timestamp
-df_main.sort_index(inplace=True)
-
-# load cluster data
-df_cl = pd.read_csv('data/Clusters_Coordinates.csv')
-
-# Making timestamp features
-
-
-def make_features(data):
-    data['year'] = data.index.year
-    data['month'] = data.index.month
-    data['week'] = data.index.isocalendar().week
-    data['day'] = data.index.day
-    data['day_of_week'] = data.index.day_of_week
-    data['day_of_year'] = data.index.day_of_year
-    data['hour'] = data.index.hour
-    data['minute'] = data.index.minute
-    data['second'] = data.index.second
-
-
-make_features(df_main)
-
-
-# merge main and cluster coordinates
-df = df_main.merge(df_cl, on='cluster_id', sort=True)
-
-# drop missing values
-df.dropna(inplace=True)
-
-
-#------------------------------------------#
-
-# EDA section
-st.title('')
-st.subheader('Exploratory Data Analysis')
-columns = st.selectbox('Select Column', [
-                       'cluster_id', 'month', 'week', 'day', 'day_of_week', 'day_of_year', 'hour', 'minute', 'second', 'user_id'])
-
-@st.cache_data
-def plot_hist(df, columns):
-    fig = px.histogram(df[columns], title='Distribution of ' + str.upper(columns).replace(
-        '_', ' '), labels={'value': str(columns).replace('_', ' ')}, height=800, width=1200)
-    return fig
-
-
-st.plotly_chart(plot_hist(df, columns), use_container_width=True)
-
-"---"
-#--------------------#
-# Tweets dataframe
-st.title('')
-st.header('Tweets')
-
-# looking through tweets
-number = st.slider('Select Number of Tweets', 1, 1000, 10)
-
-@st.cache_data
-def tweet_lists(df, number):
-    tweets = df.text.tolist()
-    dataframe = tweets[:number]
-    return dataframe
-
-tweets = tweet_lists(df, number)
-st.dataframe(tweets, width=1500)
+st.title('Model Architecture')
 
 "---"
 #--------------------------#
@@ -123,6 +53,7 @@ def tweet_lists(df, number):
         'resource intensity of processing text, we have provided a sample of 1,000 rows of pre-processed text with the various models.')
 st.subheader('')
 
+"---"
 
 #---------------------------# 
 # button columns for loading specific modeled data
@@ -298,41 +229,3 @@ def loss_haversine(observation, prediction):
 else:
     st.warning('Pick an NLP model first, change parameters, then click run', icon='🏃‍♂️') 
 
-#--------------------------------# 
-
-"---"
-st.title('')
-st.header('NLP Feature Engineering')
-# sentiment analysis
-st.subheader('Sentiment Analysis')
-sent = pd.read_csv('inputs/sent.csv', header=None)
-sent_counts =sent[0].value_counts()
-st.plotly_chart(px.bar(sent_counts, color=sent_counts.index,  title='Tweet Sentiment', height=600, width=800, 
-        template='plotly_dark', labels={'value': 'Sentiment'}))
-
-# Language detection
-st.subheader('Language Detection')
-language = pd.read_csv('inputs/lan.csv', header=None)
-# counts of the different languages
-lan_counts = language[0].value_counts()
-st.plotly_chart(px.bar(lan_counts, color=lan_counts.index, title='Tweet Languages', height=800, width=1200, 
-        template='plotly_white', labels={'index': 'Languages', 'value': 'Count'}), use_container_width=True)
-# Topics analysis
-st.subheader('Topics Analysis')
-topics = pd.read_csv('inputs/topics.csv', header=None)
-topic_counts= topics[0].value_counts()
-st.plotly_chart(px.bar(topic_counts, color=topic_counts.index, title='Tweet Topics', height=800, width=1200, 
-        template='plotly_dark', labels={'index': 'Topics', 'value': 'Count'}), use_container_width=True)
-
-# Name Entity analysis
-st.subheader('Name Entity Analysis')
-entity = pd.read_csv('inputs/ner.csv', header=None)
-entity_counts= entity[0].value_counts()
-st.plotly_chart(px.bar(entity_counts, color=entity_counts.index, title='Tweet Entities', height=800, width=1200, 
-        template='plotly_dark', labels={'index': 'Entities', 'value': 'Count'}), use_container_width=True)
-
-"---"
-#-----------------------------------#
-
-st.title('')
-st.subheader('Hypothesis Testing')
diff --git a/pages/4_NLP Features.py b/pages/4_NLP Features.py
@@ -0,0 +1,79 @@
+#import packages
+import numpy as np
+from numpy import genfromtxt
+import streamlit as st
+import pandas as pd
+import plotly_express as px
+from PIL import Image
+from streamlit.commands.page_config import Layout
+from sklearn.model_selection import train_test_split
+import tensorflow as tf 
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
+from sklearn.compose import make_column_transformer
+import seaborn as sns
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense
+from keras.optimizers import Adam
+import torch
+import transformers
+from tqdm.auto import tqdm
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+import math
+from sklearn.metrics.pairwise import haversine_distances
+from math import radians
+from streamlit.components.v1 import components
+import io 
+
+
+#----------------------------#
+# Upgrade streamlit library
+# pip install --upgrade streamlit
+
+#-----------------------------#
+# Page layout
+icon = Image.open('images/geo.ico')
+st.set_page_config(page_title='Yachay.ai Externship',
+                   page_icon=icon,
+                   layout='wide',
+                   initial_sidebar_state="auto",
+                   menu_items=None)
+
+st.title('Natural Language Processing Features')
+st.subheader('')
+# sentiment analysis
+st.subheader('Sentiment Analysis')
+sent = pd.read_csv('inputs/sent.csv', header=None)
+sent_counts =sent[0].value_counts()
+st.plotly_chart(px.bar(sent_counts, color=sent_counts.index,  title='Tweet Sentiment', height=600, width=800, 
+        template='plotly_dark', labels={'value': 'Sentiment'}))
+
+"---"
+
+# Language detection
+st.subheader('Language Detection')
+language = pd.read_csv('inputs/lan.csv', header=None)
+# counts of the different languages
+lan_counts = language[0].value_counts()
+st.plotly_chart(px.bar(lan_counts, color=lan_counts.index, title='Tweet Languages', height=800, width=1200, 
+        template='plotly_white', labels={'index': 'Languages', 'value': 'Count'}), use_container_width=True)
+
+"---"
+# Topics analysis
+st.subheader('Topics Analysis')
+topics = pd.read_csv('inputs/topics.csv', header=None)
+topic_counts= topics[0].value_counts()
+st.plotly_chart(px.bar(topic_counts, color=topic_counts.index, title='Tweet Topics', height=800, width=1200, 
+        template='plotly_dark', labels={'index': 'Topics', 'value': 'Count'}), use_container_width=True)
+
+"---"
+
+# Named entity analysis
+st.subheader('Name Entity Analysis')
+entity = pd.read_csv('inputs/ner.csv', header=None)
+entity_counts= entity[0].value_counts()
+st.plotly_chart(px.bar(entity_counts, color=entity_counts.index, title='Tweet Entities', height=800, width=1200, 
+        template='plotly_dark', labels={'index': 'Entities', 'value': 'Count'}), use_container_width=True)
+
+"---"
diff --git a/pages/3_Final Model.py → pages/5_Final Model.py b/pages/3_Final Model.py → pages/5_Final Model.py
diff --git a/pages/4_Research Articles.py → pages/6_Research Articles.py b/pages/4_Research Articles.py → pages/6_Research Articles.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -27,6 +27,7 @@ @@
     st.title('Yachay.ai')
+    "---"
     st.header('Text based geolocation prediction')
     # image1 = Image.open('images/tweet.png')
@@ Expand Down @@