Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
973 changes: 355 additions & 618 deletions 8Knot/assets/landing_page.css

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions 8Knot/db_manager/augur_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ def run_query(self, query_string: str) -> pd.DataFrame:
except:
raise Exception("DB Read Failure")

result_df = result_df.reset_index()
result_df.drop("index", axis=1, inplace=True)
result_df = result_df.reset_index(drop=True)

return result_df

Expand Down Expand Up @@ -200,7 +199,7 @@ def multiselect_startup(self):
# used when the user selects an org
# Output is of the form: {group_name: [rid1, rid2, ...], group_name: [...], ...}
df_lower_repo_names = df_search_bar.copy()
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].apply(str.lower)
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].str.lower()
self.org_name_to_repos_dict = df_lower_repo_names.groupby("rg_name")["repo_id"].apply(list).to_dict()
self.org_names = list(self.org_name_to_repos_dict.keys())

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date):
df = (df.groupby("cntrb_id")["Action"].count()).to_frame()

# sort rows according to amount of contributions from greatest to least
df.sort_values(by="Action", ascending=False, inplace=True)
df = df.sort_values(by="Action", ascending=False)

df = df.reset_index()

Expand Down
20 changes: 11 additions & 9 deletions 8Knot/pages/chaoss/visualizations/project_velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,14 @@ def process_data(
df_consolidated = pd.concat([df_actions, df_cntrbs], axis=1).reset_index()

# replace all nan to 0
df_consolidated.fillna(value=0, inplace=True)

# log of commits and contribs if values are not 0
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(lambda x: math.log(x) if x != 0 else 0)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(
lambda x: math.log(x) if x != 0 else 0
df_consolidated = df_consolidated.fillna(value=0)

# log of commits and contribs if values are not 0 (vectorized with np.log)
df_consolidated["log_num_commits"] = np.where(df_consolidated["Commit"] != 0, np.log(df_consolidated["Commit"]), 0)
df_consolidated["log_num_contrib"] = np.where(
df_consolidated["num_unique_contributors"] != 0,
np.log(df_consolidated["num_unique_contributors"]),
0,
)

# column to hold the weighted values of pr and issues actions summed together
Expand All @@ -370,10 +372,10 @@ def process_data(
)

# after weighting replace 0 with nan for log
df_consolidated["prs_issues_actions_weighted"].replace(0, np.nan, inplace=True)
df_consolidated["prs_issues_actions_weighted"] = df_consolidated["prs_issues_actions_weighted"].replace(0, np.nan)

# column for log value of pr and issue actions
df_consolidated["log_prs_issues_actions_weighted"] = df_consolidated["prs_issues_actions_weighted"].apply(math.log)
# column for log value of pr and issue actions (vectorized)
df_consolidated["log_prs_issues_actions_weighted"] = np.log(df_consolidated["prs_issues_actions_weighted"])

return df_consolidated

Expand Down
52 changes: 19 additions & 33 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,8 @@ def directory_dropdown(repo_id):
df = df[df["rl_analysis_date"] == df["rl_analysis_date"].max()]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -375,33 +371,31 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_cntbs = df_file_cntbs.drop(columns=["repo_id", "reviewer_ids"])

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.cntrb_ids.fillna("", inplace=True)
df_file["cntrb_ids"] = df_file["cntrb_ids"].fillna("")

# reformat cntrb_ids to list and remove bots if filter is on
# Vectorized: cntrb_ids is already a list after the fillna, so we convert strings to lists
if bot_switch:
df_file["cntrb_ids"] = df_file.apply(
lambda row: [x for x in row.cntrb_ids if x not in app.bots_list],
axis=1,
bots_set = set(app.bots_list)
df_file["cntrb_ids"] = df_file["cntrb_ids"].apply(
lambda ids: [x for x in ids if x not in bots_set] if isinstance(ids, list) else []
)
else:
df_file["cntrb_ids"] = df_file.apply(
lambda row: [x for x in row.cntrb_ids],
axis=1,
)
df_file["cntrb_ids"] = df_file["cntrb_ids"].apply(lambda ids: list(ids) if isinstance(ids, list) else [])

return df_file

Expand Down Expand Up @@ -453,10 +447,8 @@ def cntrb_per_directory_value(directory, df_file):
)

# Set of cntrb_ids to confirm there are no duplicate cntrb_ids
df_dynamic_directory["cntrb_ids"] = df_dynamic_directory.apply(
lambda row: set(row.cntrb_ids),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["cntrb_ids"] = [set(ids) for ids in df_dynamic_directory["cntrb_ids"]]
return df_dynamic_directory


Expand Down Expand Up @@ -485,21 +477,15 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da
df_actions = df_actions.drop_duplicates(subset="cntrb_id", keep="first")

# drop unnecessary columns not needed after preprocessing steps
df_actions = df_actions.reset_index()
df_actions.drop(
["index", "repo_id", "repo_name", "login", "Action", "rank"],
axis=1,
inplace=True,
)
df_actions = df_actions.reset_index(drop=True)
df_actions = df_actions.drop(columns=["repo_id", "repo_name", "login", "Action", "rank"])

# dictionary of cntrb_ids and their most recent activity on repo
last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()

# get list of dates of the most recent activity for each contributor for each file
df_dynamic_directory["dates"] = df_dynamic_directory.apply(
lambda row: [last_contrb[x] for x in row.cntrb_ids],
axis=1,
)
# Vectorized: use list comprehension instead of apply
df_dynamic_directory["dates"] = [[last_contrb.get(x) for x in ids] for ids in df_dynamic_directory["cntrb_ids"]]

# reformat into each row being a directory value and a date of one of the contributors
# most recent activity - preprocessing step
Expand Down Expand Up @@ -549,7 +535,7 @@ def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions:
final = final.groupby(pd.Grouper(key="dates", freq="1M"))["directory_value"].value_counts().unstack(0)

# removing the None row that was used for column formatting
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no contributors
for files in no_contribs:
Expand Down
47 changes: 18 additions & 29 deletions 8Knot/pages/codebase/visualizations/contribution_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,8 @@ def directory_dropdown(repo_id):
df["file_path"] = df["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -383,15 +379,15 @@ def df_file_clean(df_file: pd.DataFrame, df_file_pr: pd.DataFrame):
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_pr.drop(["repo_id"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_pr = df_file_pr.drop(columns=["repo_id"])

# create column with list of prs per file path
df_file_pr = df_file_pr.groupby("file_path")["pull_request_id"].apply(list)
Expand Down Expand Up @@ -449,10 +445,8 @@ def pr_per_directory_value(directory, df_file):
df_dynamic_directory.loc[df_dynamic_directory.pull_request_id == 0, "pull_request_id"] = ""

# Set of pull_request to confirm there are no duplicate pull requests
df_dynamic_directory["pull_request_id"] = df_dynamic_directory.apply(
lambda row: set(row.pull_request_id),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["pull_request_id"] = [set(ids) for ids in df_dynamic_directory["pull_request_id"]]
return df_dynamic_directory


Expand Down Expand Up @@ -480,26 +474,21 @@ def pr_to_dates(df_pr: pd.DataFrame, df_dynamic_directory: pd.DataFrame, graph_v
df_pr["merged_at"] = pd.to_datetime(df_pr["merged_at"], utc=True)

# drop unnecessary columns not needed after preprocessing steps
df_pr.drop(
["repo_id", "repo_name", "pr_src_number", "cntrb_id", "closed_at"],
axis=1,
inplace=True,
)
df_pr = df_pr.drop(columns=["repo_id", "repo_name", "pr_src_number", "cntrb_id", "closed_at"])

# dictionaries of pull_requests and their open and merge dates
pr_open = df_pr.set_index("pull_request_id")["created_at"].to_dict()
pr_merged = df_pr.set_index("pull_request_id")["merged_at"].to_dict()

# get list of pr created and merged dates for each pr
df_dynamic_directory["created_at"], df_dynamic_directory["merged_at"] = zip(
*df_dynamic_directory.apply(
lambda row: [
[pr_open[x] for x in row.pull_request_id],
[pr_merged[x] for x in row.pull_request_id if (not pd.isnull(pr_merged[x]))],
],
axis=1,
)
)
# Vectorized: use list comprehension instead of apply
created_at_list = [[pr_open.get(x) for x in ids] for ids in df_dynamic_directory["pull_request_id"]]
merged_at_list = [
[pr_merged.get(x) for x in ids if not pd.isnull(pr_merged.get(x))]
for ids in df_dynamic_directory["pull_request_id"]
]
df_dynamic_directory["created_at"] = created_at_list
df_dynamic_directory["merged_at"] = merged_at_list

# reformat into each row being a directory value and a date of one of the pull request dates
df_dynamic_directory = df_dynamic_directory.explode(graph_view)
Expand Down Expand Up @@ -548,7 +537,7 @@ def file_pr_activity_by_month(df_dynamic_directory: pd.DataFrame, df_pr: pd.Data

# removing the None row that was used for column formatting if it exists
if "nan" in final.index:
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no pull requests
for files in no_contribs:
Expand Down
52 changes: 19 additions & 33 deletions 8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,8 @@ def directory_dropdown(repo_id):
df = df[df["rl_analysis_date"] == df["rl_analysis_date"].max()]

# drop unnecessary columns not needed after preprocessing steps
df = df.reset_index()
df.drop(
["index", "repo_id", "repo_name", "repo_path", "rl_analysis_date"],
axis=1,
inplace=True,
)
df = df.reset_index(drop=True)
df = df.drop(columns=["repo_id", "repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df = df.join(df["file_path"].str.split("/", expand=True))
Expand Down Expand Up @@ -375,33 +371,31 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch
df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]

# drop unnecessary columns not needed after preprocessing steps
df_file = df_file.reset_index()
df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
df_file = df_file.reset_index(drop=True)
df_file = df_file.drop(columns=["repo_name", "repo_path", "rl_analysis_date"])

# split file path by directory
df_file = df_file.join(df_file["file_path"].str.split("/", expand=True))

# drop unnecessary columns
df_file.drop(["repo_id"], axis=1, inplace=True)
df_file_cntbs.drop(["repo_id", "cntrb_ids"], axis=1, inplace=True)
df_file = df_file.drop(columns=["repo_id"])
df_file_cntbs = df_file_cntbs.drop(columns=["repo_id", "cntrb_ids"])

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.reviewer_ids.fillna("", inplace=True)
df_file["reviewer_ids"] = df_file["reviewer_ids"].fillna("")

# reformat reviewer_ids to list and remove bots if filter is on
# Vectorized: use set for O(1) lookup instead of list
if bot_switch:
df_file["reviewer_ids"] = df_file.apply(
lambda row: [x for x in row.reviewer_ids if x not in app.bots_list],
axis=1,
bots_set = set(app.bots_list)
df_file["reviewer_ids"] = df_file["reviewer_ids"].apply(
lambda ids: [x for x in ids if x not in bots_set] if isinstance(ids, list) else []
)
else:
df_file["reviewer_ids"] = df_file.apply(
lambda row: [x for x in row.reviewer_ids],
axis=1,
)
df_file["reviewer_ids"] = df_file["reviewer_ids"].apply(lambda ids: list(ids) if isinstance(ids, list) else [])
return df_file


Expand Down Expand Up @@ -452,10 +446,8 @@ def cntrb_per_directory_value(directory, df_file):
)

# Set of reviewer_ids to confirm there are no duplicate reviewer_ids
df_dynamic_directory["reviewer_ids"] = df_dynamic_directory.apply(
lambda row: set(row.reviewer_ids),
axis=1,
)
# Vectorized: use list comprehension instead of apply for simple set conversion
df_dynamic_directory["reviewer_ids"] = [set(ids) for ids in df_dynamic_directory["reviewer_ids"]]
return df_dynamic_directory


Expand Down Expand Up @@ -484,21 +476,15 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da
df_actions = df_actions.drop_duplicates(subset="cntrb_id", keep="first")

# drop unnecessary columns not needed after preprocessing steps
df_actions = df_actions.reset_index()
df_actions.drop(
["index", "repo_id", "repo_name", "login", "Action", "rank"],
axis=1,
inplace=True,
)
df_actions = df_actions.reset_index(drop=True)
df_actions = df_actions.drop(columns=["repo_id", "repo_name", "login", "Action", "rank"])

# dictionary of reviewer_ids and their most recent activity on repo
last_contrb = df_actions.set_index("cntrb_id")["created_at"].to_dict()

# get list of dates of the most recent activity for each contributor for each file
df_dynamic_directory["dates"] = df_dynamic_directory.apply(
lambda row: [last_contrb[x] for x in row.reviewer_ids],
axis=1,
)
# Vectorized: use list comprehension instead of apply
df_dynamic_directory["dates"] = [[last_contrb.get(x) for x in ids] for ids in df_dynamic_directory["reviewer_ids"]]

# reformat into each row being a directory value and a date of one of the contributors
# most recent activity - preprocessing step
Expand Down Expand Up @@ -548,7 +534,7 @@ def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions:
final = final.groupby(pd.Grouper(key="dates", freq="1M"))["directory_value"].value_counts().unstack(0)

# removing the None row that was used for column formatting
final.drop("nan", inplace=True)
final = final.drop(index="nan")

# add back the files that had no contributors
for files in no_contribs:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def process_data(df: pd.DataFrame, interval):
# convert to datetime objects with consistent column name
# incoming value should be a posix integer.
df["author_date"] = pd.to_datetime(df["author_date"], utc=True)
df.rename(columns={"author_date": "created_at"}, inplace=True)
df = df.rename(columns={"author_date": "created_at"})

# variable to slice on to handle weekly period edge case
period_slice = None
Expand Down
Loading