-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorrelation_study.py
90 lines (75 loc) · 2.96 KB
/
correlation_study.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import streamlit as st
import data_tools as dt
def run_correlation_study(raw_docs, author_map):
    """Render the "Correlation Study" Streamlit page.

    The page has two parts:

    1. Dependent variables: describes ``Rank`` and ``CitationCount`` of
       ``raw_docs`` and shows their pairwise correlation, including against
       the ``AuthorProminence`` column of the frame returned by
       ``dt.add_author_prominence_feature``.
    2. Independent variables: describes ``Rank`` / ``CitationCount`` per
       ``DocType`` (Journal, Book, Patent, Conference) and then runs a
       correlation analysis restricted to the journal rows.

    Args:
        raw_docs: Tabular document data. The code reads the columns ``Rank``,
            ``CitationCount``, ``DocType``, ``FirstPage`` and ``LastPage`` and
            calls ``.copy()`` on it, so it is presumably a pandas DataFrame
            (TODO confirm against the caller).
        author_map: Passed through unchanged to
            ``dt.add_author_prominence_feature``; its structure is not
            visible from this function.

    Returns:
        None. All output is produced through ``st`` and ``dt`` calls.
    """

    def describe_targets(frame, rank_title, citation_title):
        # Describe the two dependent variables of ``frame``, Rank first.
        dt.describe(frame.Rank, title=rank_title, xlabel="Rank (discrete)")
        dt.describe(
            frame.CitationCount,
            title=citation_title,
            xlabel="Citation Count (discrete)",
        )

    def correlate_pair(frame, heading, first, second):
        # One subsection: heading, correlation of the pair, then scatter plot.
        st.subheader(heading)
        dt.correlation(frame, [first, second])
        dt.show_relative_scatter(frame, first, second)

    st.title("Correlation Study")

    # ---- Part 1: dependent variables -------------------------------------
    st.header("1. Dependent Variable Analysis")
    describe_targets(raw_docs, "Rank", "Citation Count")
    correlate_pair(
        raw_docs,
        "Rank vs. Citation Count (Small correlation)",
        "Rank",
        "CitationCount",
    )

    with_prominence = dt.add_author_prominence_feature(raw_docs, author_map)
    correlate_pair(
        with_prominence,
        "Rank vs. Author Prominence",
        "Rank",
        "AuthorProminence",
    )
    correlate_pair(
        with_prominence,
        "CitationCount vs. Author Prominence",
        "CitationCount",
        "AuthorProminence",
    )

    # ---- Part 2: independent variables -----------------------------------
    st.header("2. Independent Variable Analysis")
    st.subheader("DocType")

    # Work on a copy so the PageCount column is not added to the caller's data.
    docs = raw_docs.copy()
    first_pages = docs["FirstPage"].values
    last_pages = docs["LastPage"].values
    docs["PageCount"] = dt.get_page_count(first_pages, last_pages)

    by_doc_type = {}
    for doc_type in ("Journal", "Book", "Patent", "Conference"):
        subset = docs[docs["DocType"] == doc_type]
        describe_targets(
            subset,
            f"{doc_type} Rank",
            f"{doc_type} Citation Count",
        )
        by_doc_type[doc_type] = subset

    # Only the journal rows are used for the remaining analysis.
    journals = by_doc_type["Journal"]

    st.subheader("Correlation Analysis - With Journal dataset")
    field_of_study = [
        name for name in raw_docs if name.startswith("FieldOfStudy_")
    ]
    # Only the first three FieldOfStudy_* columns take part in the analysis.
    leading_fields = field_of_study[:3]

    str_cols = dt.get_string_columns(
        journals,
        include=["Publisher", "JournalName", *leading_fields],
    )
    # Handed to dt.encode_categorical; presumably filled with the fitted
    # encoders (TODO confirm). It is not read again in this function.
    le_dic = {}
    journals = dt.encode_categorical(journals, str_cols, le_dic)
    st.write(journals.head())

    y_columns = ["Rank", "CitationCount"]
    # dt.separate_datasets yields one dataset per column group given; only
    # the second and third are plotted below.
    _first_set, second_set, third_set, _last_set = dt.separate_datasets(
        journals,
        [],
        ["JournalName", "Publisher", "FirstPage", "LastPage", *leading_fields],
        ["PageCount", *leading_fields],
        ["Title", "Abstract"],
        y_columns=y_columns,
    )
    for dataset in (second_set, third_set):
        dt.correlation(dataset, plot=True)