oss-aspen · EngCaioFonseca · Dec 13, 2025 · Dec 13, 2025 · Dec 19, 2025
diff --git a/8Knot/pages/contributors/visualizations/contrib_activity_cycle.py b/8Knot/pages/contributors/visualizations/contrib_activity_cycle.py
@@ -4,10 +4,12 @@
 from dash.dependencies import Input, Output, State
 import plotly.graph_objects as go
 import pandas as pd
+import polars as pl
 import logging
 from dateutil.relativedelta import *  # type: ignore
 import plotly.express as px
 from pages.utils.graph_utils import baby_blue
+from pages.utils.polars_utils import to_polars, to_pandas
 from queries.commits_query import commits_query as cmq
 import cache_manager.cache_facade as cf
 from pages.utils.job_utils import nodata_graph
@@ -156,36 +158,83 @@ def contrib_activity_cycle_graph(repolist, interval):
 
 
 def process_data(df: pd.DataFrame, interval):
-    # for this usecase we want the datetimes to be in their local values
-    # tricking pandas to keep local values when UTC conversion is required for to_datetime
-    df["author_timestamp"] = df["author_timestamp"].astype("str").str[:-6]
-    df["committer_timestamp"] = df["committer_timestamp"].astype("str").str[:-6]
-
-    # convert to datetime objects rather than strings
-    df["author_timestamp"] = pd.to_datetime(df["author_timestamp"], utc=True)
-    df["committer_timestamp"] = pd.to_datetime(df["committer_timestamp"], utc=True)
-    # removes duplicate values when the author and committer is the same
-    df.loc[df["author_timestamp"] == df["committer_timestamp"], "author_timestamp"] = None
+    """
+    Count commit timestamps per hour of day or per weekday, computed with Polars.
+
+    Follows the "Polars Core, Pandas Edge" architecture: the frame is converted
+    with to_polars, aggregated in Polars, and converted back with to_pandas.
+
+    Args:
+        df: commit data with "author_timestamp" and "committer_timestamp" columns.
+        interval: "H" buckets by hour of day; any other value buckets by weekday name.
+
+    Returns:
+        Series of occurrence counts indexed by, and named after, the bucket
+        column ("Hour" or "Weekday"), sorted by that column.
+    """
+    # === POLARS PROCESSING START ===
+
+    # Convert to Polars for fast processing
+    pl_df = to_polars(df)
+
+    # Keep the local wall-clock values: drop the trailing UTC offset from the
+    # string form before parsing. Polars str.slice takes (offset, length), not an
+    # end index, so "all but the last six characters" is written as a regex here.
+    # NOTE(review): assumes the string form ends in a six-character offset, as
+    # the pandas `.str[:-6]` this replaces did - confirm against the query output.
+    pl_df = pl_df.with_columns(
+        [
+            pl.col(name).cast(pl.Utf8).str.replace(r".{6}$", "").str.to_datetime().alias(name)
+            for name in ("author_timestamp", "committer_timestamp")
+        ]
+    )
 
-    df_final = pd.DataFrame()
+    # Remove duplicate values when author and committer are the same
+    pl_df = pl_df.with_columns(
+        pl.when(pl.col("author_timestamp") == pl.col("committer_timestamp"))
+        .then(None)
+        .otherwise(pl.col("author_timestamp"))
+        .alias("author_timestamp")
+    )
 
     if interval == "H":
-        # combine the hour values for author and committer
-        hour = pd.concat([df["author_timestamp"].dt.hour, df["committer_timestamp"].dt.hour])
-        df_hour = pd.DataFrame(hour, columns=["Hour"])
-        df_final = df_hour.groupby(["Hour"])["Hour"].count()
+        # Extract hour values and combine
+        author_hours = pl_df.select(pl.col("author_timestamp").dt.hour().alias("Hour")).drop_nulls()
+        committer_hours = pl_df.select(pl.col("committer_timestamp").dt.hour().alias("Hour")).drop_nulls()
+        combined = pl.concat([author_hours, committer_hours])
+        # The count needs its own column name: aliasing it to the group key
+        # would give the aggregated frame two columns called "Hour".
+        pl_result = combined.group_by("Hour").agg(pl.len().alias("count")).sort("Hour")
     else:
-        # combine the weekday values for author and committer
-        weekday = pd.concat(
-            [
-                df["author_timestamp"].dt.day_name(),
-                df["committer_timestamp"].dt.day_name(),
-            ]
+        # Extract weekday names and combine
+        # Polars uses 1-7 for weekdays, we need to map to names
+        weekday_map = {
+            1: "Monday",
+            2: "Tuesday",
+            3: "Wednesday",
+            4: "Thursday",
+            5: "Friday",
+            6: "Saturday",
+            7: "Sunday",
+        }
+        author_weekdays = pl_df.select(pl.col("author_timestamp").dt.weekday().alias("day_num")).drop_nulls()
+        committer_weekdays = pl_df.select(pl.col("committer_timestamp").dt.weekday().alias("day_num")).drop_nulls()
+        combined = pl.concat([author_weekdays, committer_weekdays])
+
+        # Map day numbers to names
+        combined = combined.with_columns(
+            pl.col("day_num").replace_strict(weekday_map, default="Unknown").alias("Weekday")
         )
-        df_weekday = pd.DataFrame(weekday, columns=["Weekday"])
-        df_final = df_weekday.groupby(["Weekday"])["Weekday"].count()
+        # Separate count column for the same reason as in the hourly branch
+        pl_result = combined.group_by("Weekday").agg(pl.len().alias("count")).sort("Weekday")
+
+    # === POLARS PROCESSING END ===
 
-    return df_final
+    # Convert to Pandas Series for compatibility with existing create_figure
+    result_df = to_pandas(pl_result)
+    bucket = result_df.columns[0]
+    # Name the Series after its bucket column, as the pandas groupby it replaces did
+    return result_df.set_index(bucket)["count"].rename(bucket)
 
 
 def create_figure(df: pd.DataFrame, interval):

diff --git a/8Knot/pages/contributors/visualizations/contrib_drive_repeat.py b/8Knot/pages/contributors/visualizations/contrib_drive_repeat.py
@@ -4,9 +4,11 @@
 from dash import callback
 from dash.dependencies import Input, Output, State
 import pandas as pd
+import polars as pl
 import logging
 import plotly.express as px
 from pages.utils.graph_utils import baby_blue
+from pages.utils.polars_utils import to_polars, to_pandas
 from pages.utils.job_utils import nodata_graph
 from queries.contributors_query import contributors_query as ctq
 import time
@@ -210,24 +212,43 @@ def repeat_drive_by_graph(repolist, contribs, view, bot_switch):
 
 
 def process_data(df, view, contribs):
-    # convert to datetime objects with consistent column name
-    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
-    # df.rename(columns={"created_at": "created"}, inplace=True)
+    """
+    Select contribution rows by whether their contributor reaches a given rank, using Polars.
 
-    # graph on contribution subset
-    contributors = df["cntrb_id"][df["rank"] == contribs].to_list()
-    df_cont_subset = pd.DataFrame(df)
+    Follows the "Polars Core, Pandas Edge" architecture.
+
+    Args:
+        df: contribution data with "created_at", "cntrb_id" and "rank" columns.
+        view: "drive" keeps rows whose cntrb_id is not among the contributors
+            that have a row with rank == contribs; any other value keeps the
+            rows whose cntrb_id is among them.
+        contribs: value compared for equality against the "rank" column.
+
+    Returns:
+        The selected rows, converted back with to_pandas.
+    """
+    # === POLARS PROCESSING START ===
 
-    # filtering data by view
+    # Convert to Polars for fast processing
+    pl_df = to_polars(df)
+
+    # Convert to datetime
+    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))
+
+    # Contributors that have a row with the requested rank. A plain list is
+    # enough for is_in, so no de-duplication or set round-trip is needed.
+    contributors = pl_df.filter(pl.col("rank") == contribs).get_column("cntrb_id").to_list()
+
+    # Filter based on view
     if view == "drive":
-        df_cont_subset = df_cont_subset.loc[~df_cont_subset["cntrb_id"].isin(contributors)]
+        pl_result = pl_df.filter(~pl.col("cntrb_id").is_in(contributors))
     else:
-        df_cont_subset = df_cont_subset.loc[df_cont_subset["cntrb_id"].isin(contributors)]
+        pl_result = pl_df.filter(pl.col("cntrb_id").is_in(contributors))
 
-    # reset index to be ready for plotly
-    df_cont_subset = df_cont_subset.reset_index()
+    # === POLARS PROCESSING END ===
 
-    return df_cont_subset
+    # Convert to Pandas for visualization
+    return to_pandas(pl_result)
 
 
 def create_figure(df_cont_subset):

diff --git a/8Knot/pages/contributors/visualizations/contrib_importance_pie.py b/8Knot/pages/contributors/visualizations/contrib_importance_pie.py
@@ -6,10 +6,12 @@
 from dash.dependencies import Input, Output, State
 import plotly.graph_objects as go
 import pandas as pd
+import polars as pl
 import logging
 from dateutil.relativedelta import *  # type: ignore
 import plotly.express as px
 from pages.utils.graph_utils import get_graph_time_values, baby_blue
+from pages.utils.polars_utils import to_polars, to_pandas
 from queries.contributors_query import contributors_query as ctq
 from pages.utils.job_utils import nodata_graph
 import time
@@ -253,51 +255,74 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, start_date, end_date
 
 
 def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date):
-    # convert to datetime objects rather than strings
-    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
+    """
+    Rank contributors by number of contributions of one action type, using Polars.
 
-    # order values chronologically by created_at date
-    df = df.sort_values(by="created_at", ascending=True)
+    Follows the "Polars Core, Pandas Edge" architecture.
+
+    Args:
+        df: contribution data with "created_at", "cntrb_id" and "Action" columns.
+        action_type: pattern matched against "Action"; also the name of the count column.
+        top_k: number of top contributors kept as individual rows.
+        start_date: inclusive lower bound on "created_at", or None for no bound.
+        end_date: inclusive upper bound on "created_at", or None for no bound.
+
+    Returns:
+        Frame (via to_pandas) with one row per top-k contributor plus a final
+        "Other" row holding the count of all remaining contributions.
+    """
+    # === POLARS PROCESSING START ===
 
-    # filter values based on date picker
-    if start_date is not None:
-        df = df[df.created_at >= start_date]
-    if end_date is not None:
-        df = df[df.created_at <= end_date]
-
-    # subset the df such that it only contains rows where the Action column value is the action type
-    df = df[df["Action"].str.contains(action_type)]
+    # Convert to Polars for fast processing
+    pl_df = to_polars(df)
 
-    # get the number of total contributions of the specific action type
-    t_sum = df.shape[0]
+    # Convert to datetime. No chronological sort is needed here: the rows are
+    # only counted per contributor below, which discards row order.
+    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))
 
-    # count the number of contributions for each contributor
-    df = (df.groupby("cntrb_id")["Action"].count()).to_frame()
-
-    # sort rows according to amount of contributions from greatest to least
-    df = df.sort_values(by="Action", ascending=False)
+    # Filter by date range. The bounds are coerced to UTC-aware datetimes so
+    # the comparison does not depend on the type they arrive in.
+    # NOTE(review): presumably ISO date strings from the date picker - confirm.
+    if start_date is not None:
+        pl_df = pl_df.filter(pl.col("created_at") >= pd.to_datetime(start_date, utc=True).to_pydatetime())
+    if end_date is not None:
+        pl_df = pl_df.filter(pl.col("created_at") <= pd.to_datetime(end_date, utc=True).to_pydatetime())
 
-    df = df.reset_index()
+    # Filter by action type
+    pl_df = pl_df.filter(pl.col("Action").str.contains(action_type))
 
-    # rename Action column to action_type
-    df = df.rename(columns={"Action": action_type})
+    # Count contributions per contributor. group_by output order is not
+    # guaranteed, so ties are broken on cntrb_id to keep the top-k cut stable.
+    pl_grouped = (
+        pl_df.group_by("cntrb_id")
+        .agg(pl.len().alias(action_type))
+        .sort([action_type, "cntrb_id"], descending=[True, False])
+    )
 
-    # get the number of total contributions
-    t_sum = df[action_type].sum()
+    # Get total sum
+    t_sum = pl_grouped.select(pl.col(action_type).sum()).item()
 
-    # index df to get first k rows
-    df = df.head(top_k)
+    # Get top k. Cast to a fixed schema so the "Other" row below can be stacked
+    # onto it: pl.len() yields an unsigned count, a Python int literal does not.
+    pl_top_k = pl_grouped.head(top_k).with_columns(
+        [
+            pl.col("cntrb_id").cast(pl.Utf8),
+            pl.col(action_type).cast(pl.Int64),
+        ]
+    )
+    df_sum = pl_top_k.select(pl.col(action_type).sum()).item()
 
-    # get the number of total top k contributions
-    df_sum = df[action_type].sum()
+    # Add "Other" row for remaining contributions
+    other_row = pl.DataFrame(
+        {"cntrb_id": ["Other"], action_type: [t_sum - df_sum]},
+        schema={"cntrb_id": pl.Utf8, action_type: pl.Int64},
+    )
+    pl_result = pl.concat([pl_top_k, other_row])
 
-    # calculate the remaining contributions by taking the the difference of t_sum and df_sum
-    # dataframes no longer implement above 'append' interface as of Pandas 1.4.4
-    # create a single-entry dataframe that we can concatenate onto existing df
-    df_concat = pd.DataFrame(data={"cntrb_id": ["Other"], action_type: [t_sum - df_sum]})
-    df = pd.concat([df, df_concat], ignore_index=True)
+    # === POLARS PROCESSING END ===
 
-    return df
+    # Convert to Pandas for visualization
+    return to_pandas(pl_result)
 
 
 def create_figure(df: pd.DataFrame, action_type):

diff --git a/8Knot/pages/contributors/visualizations/contribs_by_action.py b/8Knot/pages/contributors/visualizations/contribs_by_action.py
@@ -4,10 +4,12 @@
 from dash.dependencies import Input, Output, State
 import plotly.graph_objects as go
 import pandas as pd
+import polars as pl
 import logging
 from dateutil.relativedelta import *  # type: ignore
 import plotly.express as px
 from pages.utils.graph_utils import get_graph_time_values, baby_blue
+from pages.utils.polars_utils import to_polars, to_pandas
 from queries.contributors_query import contributors_query as ctq
 from pages.utils.job_utils import nodata_graph
 import time
@@ -221,32 +223,50 @@ def contribs_by_action_graph(repolist, interval, action, bot_switch):
 
 
 def process_data(df: pd.DataFrame, interval, action):
-    # convert to datetime objects rather than strings
-    df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
+    """
+    Keep one contribution row per contributor per period for one action, using Polars.
 
-    # order values chronologically by COLUMN_TO_SORT_BY date
-    df = df.sort_values(by="created_at", axis=0, ascending=True)
+    Follows the "Polars Core, Pandas Edge" architecture.
+
+    Args:
+        df: contribution data with "created_at", "cntrb_id" and "Action" columns.
+        interval: "M1", "M3", "M6" or "M12"; any other value is treated as "M1".
+        action: pattern matched against the "Action" column.
+
+    Returns:
+        Frame (via to_pandas) in chronological order with "created_at" replaced
+        by the start of its period and at most one row per contributor per period.
+    """
+    # === POLARS PROCESSING START ===
 
-    # drop all contributions that are not the selected action
-    df = df[df["Action"].str.contains(action)]
+    # Convert to Polars for fast processing
+    pl_df = to_polars(df)
 
-    # For distinct contributors per interval: keep one row per (cntrb_id, interval)
-    """df["_period"] = df["created_at"].dt.to_period(interval)
-    df = df.drop_duplicates(subset=["cntrb_id", "_period"], keep="first")
-    # Use the start of the interval for plotting consistency
-    df["created_at"] = df["_period"].dt.start_time
-    df = df.drop(columns=["_period"])  # cleanup"""
+    # Convert to datetime and sort
+    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))
+    pl_df = pl_df.sort("created_at")
 
-    freq_map = {"M1": "M", "M3": "Q", "M6": "2Q", "M12": "Y"}
-    pandas_freq = freq_map.get(interval, interval)
+    # Filter for selected action using Polars string contains
+    pl_df = pl_df.filter(pl.col("Action").str.contains(action))
 
-    df["_period"] = df["created_at"].dt.to_period(pandas_freq)
-    df = df.drop_duplicates(subset=["cntrb_id", "_period"], keep="first")
-    df["created_at"] = df["_period"].dt.start_time
-    df = df.drop(columns=["_period"])
-    print(df)
+    # Map interval to Polars truncation format
+    # Unrecognised interval values fall back to monthly buckets
+    interval_map = {"M1": "1mo", "M3": "3mo", "M6": "6mo", "M12": "1y"}
+    polars_interval = interval_map.get(interval, "1mo")
 
-    return df
+    # Add period column and dedupe per contributor per period.
+    # maintain_order=True keeps the chronological order established by the sort
+    # above; without it unique() may return the rows in any order.
+    pl_df = pl_df.with_columns(pl.col("created_at").dt.truncate(polars_interval).alias("_period"))
+    pl_df = pl_df.unique(subset=["cntrb_id", "_period"], keep="first", maintain_order=True)
+
+    # Update created_at to period start time
+    pl_df = pl_df.with_columns(pl.col("_period").alias("created_at")).drop("_period")
+
+    # === POLARS PROCESSING END ===
+
+    # Convert to Pandas for visualization
+    return to_pandas(pl_df)
 
 
 def create_figure(df: pd.DataFrame, interval, action):