-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore.py
66 lines (52 loc) · 1.53 KB
/
explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import streamlit as st
from data_tools import (
st_dataset_selector,
load_dataset,
time_it,
one_hot_encode_authors,
add_author_prominence_feature,
)
st.header("Data Exploration")


def _loading_label(ret):
    # Called by time_it with load_dataset's return value (docs, author_map)
    # to build the timing message shown to the user.
    return "Loading dataset ({} docs)".format(len(ret[0]))


# Rebind the slow data helpers to timed wrappers so each call reports its
# elapsed time in the UI. Must happen before any call below uses these names.
load_dataset = time_it(_loading_label, load_dataset)
one_hot_encode_authors = time_it("One-hot encoding authors", one_hot_encode_authors)

# Upper bound on documents to parse; large values make the page sluggish.
docs_limit = st.number_input(
    "Max limit of docs to parse (more than 10000 items will be slow)",
    value=1000,
    step=50,
)
# Let the user pick a dataset, then load at most `docs_limit` documents.
selected_dataset = st_dataset_selector()
raw_docs, author_map = load_dataset(selected_dataset, docs_limit)

st.subheader("Raw docs shape")
# Explicit st.write instead of a bare `raw_docs.shape` expression: bare
# expressions depend on Streamlit "magic" rendering, which can be turned off
# (magicEnabled = false) and does nothing if this file is ever imported.
st.write(raw_docs.shape)

st.subheader("First 10 papers")
st.write(raw_docs.head(10))

st.subheader("Features")
st.write(", ".join(raw_docs.columns))
# Study modules are imported at their point of use — presumably to defer the
# cost of loading them until this page actually runs; TODO confirm before
# hoisting these imports to the top of the file.
from correlation_study import run_correlation_study
run_correlation_study(raw_docs, author_map)
from distribution_study import run_distribution_study
run_distribution_study(raw_docs)
from vectorize_text_study import run_vectorize_text_study

st.markdown(
    """
## Vectorizing abstracts
**Warning:** This is slow for n > 10 000 docs
"""
)

# Vectorization is expensive, so it only runs on an explicit button press.
if st.button("Vectorize text"):
    run_vectorize_text_study(raw_docs)
st.markdown(
    """
## One-hot encoding authors
**Warning:** This is slow on n > 10 000 docs
"""
)

# Encoding is expensive, so it only runs on an explicit button press.
if st.button("Run one-hot encoding"):
    one_hot_encoded = one_hot_encode_authors(raw_docs)
    st.subheader("One-hot-encoded authors shape")
    # Explicit st.write instead of a bare `one_hot_encoded.shape` expression:
    # bare expressions depend on Streamlit "magic" rendering, which can be
    # turned off (magicEnabled = false) and silently renders nothing.
    st.write(one_hot_encoded.shape)