Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
973 changes: 355 additions & 618 deletions 8Knot/assets/landing_page.css

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions 8Knot/db_manager/augur_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ def run_query(self, query_string: str) -> pd.DataFrame:
except:
raise Exception("DB Read Failure")

result_df = result_df.reset_index()
result_df.drop("index", axis=1, inplace=True)
result_df = result_df.reset_index(drop=True)

return result_df

Expand Down Expand Up @@ -200,7 +199,7 @@ def multiselect_startup(self):
# used when the user selects an org
# Output is of the form: {group_name: [rid1, rid2, ...], group_name: [...], ...}
df_lower_repo_names = df_search_bar.copy()
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].apply(str.lower)
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].str.lower()
self.org_name_to_repos_dict = df_lower_repo_names.groupby("rg_name")["repo_id"].apply(list).to_dict()
self.org_names = list(self.org_name_to_repos_dict.keys())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date):
df = (df.groupby("cntrb_id")["Action"].count()).to_frame()

# sort rows according to amount of contributions from greatest to least
df.sort_values(by="Action", ascending=False, inplace=True)
df = df.sort_values(by="Action", ascending=False)

df = df.reset_index()

Expand Down
20 changes: 11 additions & 9 deletions 8Knot/pages/chaoss/visualizations/project_velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,14 @@ def process_data(
df_consolidated = pd.concat([df_actions, df_cntrbs], axis=1).reset_index()

# replace all nan to 0
df_consolidated.fillna(value=0, inplace=True)

# log of commits and contribs if values are not 0
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(lambda x: math.log(x) if x != 0 else 0)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(
lambda x: math.log(x) if x != 0 else 0
df_consolidated = df_consolidated.fillna(value=0)

# log of commits and contribs if values are not 0 (vectorized with np.log)
df_consolidated["log_num_commits"] = np.where(df_consolidated["Commit"] != 0, np.log(df_consolidated["Commit"]), 0)
df_consolidated["log_num_contrib"] = np.where(
df_consolidated["num_unique_contributors"] != 0,
np.log(df_consolidated["num_unique_contributors"]),
0,
)

# column to hold the weighted values of pr and issues actions summed together
Expand All @@ -370,10 +372,10 @@ def process_data(
)

# after weighting replace 0 with nan for log
df_consolidated["prs_issues_actions_weighted"].replace(0, np.nan, inplace=True)
df_consolidated["prs_issues_actions_weighted"] = df_consolidated["prs_issues_actions_weighted"].replace(0, np.nan)

# column for log value of pr and issue actions
df_consolidated["log_prs_issues_actions_weighted"] = df_consolidated["prs_issues_actions_weighted"].apply(math.log)
# column for log value of pr and issue actions (vectorized)
df_consolidated["log_prs_issues_actions_weighted"] = np.log(df_consolidated["prs_issues_actions_weighted"])

return df_consolidated

Expand Down
52 changes: 19 additions & 33 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,8 @@ def directory_dropdown(repo_id):
df = df[df["rl_analysis_date"] == df["rl_analysis_date"].max()]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -375,33 +371,31 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_cntbs = df_file_cntbs.drop(columns=["repo_id", "reviewer_ids"])

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.cntrb_ids.fillna("", inplace=True)
df_file["cntrb_ids"] = df_file["cntrb_ids"].fillna("")

# reformat cntrb_ids to list and remove bots if filter is on
# Vectorized: cntrb_ids is already a list after the fillna, so we convert strings to lists
if bot_switch:
df_file["cntrb_ids"] = df_file.apply(
lambda row: [x for x in row.cntrb_ids if x not in app.bots_list],
axis=1,
bots_set = set(app.bots_list)
df_file["cntrb_ids"] = df_file["cntrb_ids"].apply(
lambda ids: [x for x in ids if x not in bots_set] if isinstance(ids, list) else []
)
else:
df_file["cntrb_ids"] = df_file.apply(
lambda row: [x for x in row.cntrb_ids],
axis=1,
)
df_file["cntrb_ids"] = df_file["cntrb_ids"].apply(lambda ids: list(ids) if isinstance(ids, list) else [])

return df_file

Expand Down Expand Up @@ -453,10 +447,8 @@ def cntrb_per_directory_value(directory, df_file):
)

# Set of cntrb_ids to confirm there are no duplicate cntrb_ids
df_dynamic_directory["cntrb_ids"] = df_dynamic_directory.apply(
lambda row: set(row.cntrb_ids),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["cntrb_ids"] = [set(ids) for ids in df_dynamic_directory["cntrb_ids"]]
return df_dynamic_directory


Expand Down Expand Up @@ -485,21 +477,15 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da
df_actions = df_actions.drop_duplicates(subset="cntrb_id", keep="first")

# drop unnecessary columns not needed after preprocessing steps
df_actions = df_actions.reset_index()
df_actions.drop(
["index", "repo_id", "repo_name", "login", "Action", "rank"],
axis=1,
inplace=True,
)
df_actions = df_actions.reset_index(drop=True)
df_actions = df_actions.drop(columns=["repo_id", "repo_name", "login", "Action", "rank"])

# dictionary of cntrb_ids and their most recent activity on repo
last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()

# get list of dates of the most recent activity for each contributor for each file
df_dynamic_directory["dates"] = df_dynamic_directory.apply(
lambda row: [last_contrb[x] for x in row.cntrb_ids],
axis=1,
)
# Vectorized: use list comprehension instead of apply
df_dynamic_directory["dates"] = [[last_contrb.get(x) for x in ids] for ids in df_dynamic_directory["cntrb_ids"]]

# reformat into each row being a directory value and a date of one of the contributors
# most recent activity - preprocessing step
Expand Down Expand Up @@ -549,7 +535,7 @@ def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions:
final = final.groupby(pd.Grouper(key="dates", freq="1M"))["directory_value"].value_counts().unstack(0)

# removing the None row that was used for column formatting
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no contributors
for files in no_contribs:
Expand Down
47 changes: 18 additions & 29 deletions 8Knot/pages/codebase/visualizations/contribution_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,8 @@ def directory_dropdown(repo_id):
df["file_path"] = df["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -383,15 +379,15 @@ def df_file_clean(df_file: pd.DataFrame, df_file_pr: pd.DataFrame):
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_pr.drop(["repo_id"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_pr = df_file_pr.drop(columns=["repo_id"])

# create column with list of prs per file path
df_file_pr = df_file_pr.groupby("file_path")["pull_request_id"].apply(list)
Expand Down Expand Up @@ -449,10 +445,8 @@ def pr_per_directory_value(directory, df_file):
df_dynamic_directory.loc[df_dynamic_directory.pull_request_id == 0, "pull_request_id"] = ""

# Set of pull_request to confirm there are no duplicate pull requests
df_dynamic_directory["pull_request_id"] = df_dynamic_directory.apply(
lambda row: set(row.pull_request_id),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["pull_request_id"] = [set(ids) for ids in df_dynamic_directory["pull_request_id"]]
return df_dynamic_directory


Expand Down Expand Up @@ -480,26 +474,21 @@ def pr_to_dates(df_pr: pd.DataFrame, df_dynamic_directory: pd.DataFrame, graph_v
df_pr["merged_at"] = pd.to_datetime(df_pr["merged_at"], utc=True)

# drop unnecessary columns not needed after preprocessing steps
df_pr.drop(
["repo_id", "repo_name", "pr_src_number", "cntrb_id", "closed_at"],
axis=1,
inplace=True,
)
df_pr = df_pr.drop(columns=["repo_id", "repo_name", "pr_src_number", "cntrb_id", "closed_at"])

# dictionaries of pull_requests and their open and merge dates
pr_open = df_pr.set_index("pull_request_id")["created_at"].to_dict()
pr_merged = df_pr.set_index("pull_request_id")["merged_at"].to_dict()

# get list of pr created and merged dates for each pr
df_dynamic_directory["created_at"], df_dynamic_directory["merged_at"] = zip(
*df_dynamic_directory.apply(
lambda row: [
[pr_open[x] for x in row.pull_request_id],
[pr_merged[x] for x in row.pull_request_id if (not pd.isnull(pr_merged[x]))],
],
axis=1,
)
)
# Vectorized: use list comprehension instead of apply
created_at_list = [[pr_open.get(x) for x in ids] for ids in df_dynamic_directory["pull_request_id"]]
merged_at_list = [
[pr_merged.get(x) for x in ids if not pd.isnull(pr_merged.get(x))]
for ids in df_dynamic_directory["pull_request_id"]
]
df_dynamic_directory["created_at"] = created_at_list
df_dynamic_directory["merged_at"] = merged_at_list

# reformat into each row being a directory value and a date of one of the pull request dates
df_dynamic_directory = df_dynamic_directory.explode(graph_view)
Expand Down Expand Up @@ -548,7 +537,7 @@ def file_pr_activity_by_month(df_dynamic_directory: pd.DataFrame, df_pr: pd.Data

# removing the None row that was used for column formatting if it exists
if "nan" in final.index:
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no pull requests
for files in no_contribs:
Expand Down
52 changes: 19 additions & 33 deletions 8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,8 @@ def directory_dropdown(repo_id):
df = df[df["rl_analysis_date"] == df["rl_analysis_date"].max()]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -375,33 +371,31 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_cntbs.drop(["repo_id", "cntrb_ids"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_cntbs = df_file_cntbs.drop(columns=["repo_id", "cntrb_ids"])

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.reviewer_ids.fillna("", inplace=True)
df_file["reviewer_ids"] = df_file["reviewer_ids"].fillna("")

# reformat reviewer_ids to list and remove bots if filter is on
# Vectorized: use set for O(1) lookup instead of list
if bot_switch:
df_file["reviewer_ids"] = df_file.apply(
lambda row: [x for x in row.reviewer_ids if x not in app.bots_list],
axis=1,
bots_set = set(app.bots_list)
df_file["reviewer_ids"] = df_file["reviewer_ids"].apply(
lambda ids: [x for x in ids if x not in bots_set] if isinstance(ids, list) else []
)
else:
df_file["reviewer_ids"] = df_file.apply(
lambda row: [x for x in row.reviewer_ids],
axis=1,
)
df_file["reviewer_ids"] = df_file["reviewer_ids"].apply(lambda ids: list(ids) if isinstance(ids, list) else [])
return df_file


Expand Down Expand Up @@ -452,10 +446,8 @@ def cntrb_per_directory_value(directory, df_file):
)

# Set of reviewer_ids to confirm there are no duplicate reviewer_ids
df_dynamic_directory["reviewer_ids"] = df_dynamic_directory.apply(
lambda row: set(row.reviewer_ids),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["reviewer_ids"] = [set(ids) for ids in df_dynamic_directory["reviewer_ids"]]
return df_dynamic_directory


Expand Down Expand Up @@ -484,21 +476,15 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da
df_actions = df_actions.drop_duplicates(subset="cntrb_id", keep="first")

# drop unnecessary columns not needed after preprocessing steps
df_actions = df_actions.reset_index()
df_actions.drop(
["index", "repo_id", "repo_name", "login", "Action", "rank"],
axis=1,
inplace=True,
)
df_actions = df_actions.reset_index(drop=True)
df_actions = df_actions.drop(columns=["repo_id", "repo_name", "login", "Action", "rank"])

# dictionary of reviewer_ids and their most recent activity on repo
last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()

# get list of dates of the most recent activity for each contributor for each file
df_dynamic_directory["dates"] = df_dynamic_directory.apply(
lambda row: [last_contrb[x] for x in row.reviewer_ids],
axis=1,
)
# Vectorized: use list comprehension instead of apply
df_dynamic_directory["dates"] = [[last_contrb.get(x) for x in ids] for ids in df_dynamic_directory["reviewer_ids"]]

# reformat into each row being a directory value and a date of one of the contributors
# most recent activity - preprocessing step
Expand Down Expand Up @@ -548,7 +534,7 @@ def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions:
final = final.groupby(pd.Grouper(key="dates", freq="1M"))["directory_value"].value_counts().unstack(0)

# removing the None row that was used for column formatting
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no contributors
for files in no_contribs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def process_data(df: pd.DataFrame, interval):
# convert to datetime objects with consistent column name
# incoming value should be a posix integer.
df["author_date"] = pd.to_datetime(df["author_date"], utc=True)
df.rename(columns={"author_date": "created_at"}, inplace=True)
df = df.rename(columns={"author_date": "created_at"})

# variable to slice on to handle weekly period edge case
period_slice = None
Expand Down
Loading