Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 55 additions & 24 deletions 8Knot/pages/contributors/visualizations/contrib_activity_cycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import polars as pl
import logging
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.commits_query import commits_query as cmq
import cache_manager.cache_facade as cf
from pages.utils.job_utils import nodata_graph
Expand Down Expand Up @@ -156,36 +158,65 @@ def contrib_activity_cycle_graph(repolist, interval):


def process_data(df: pd.DataFrame, interval):
    """
    Count commit activity per hour of day or per weekday using Polars.

    Author and committer timestamps are both counted. When a commit's author
    and committer timestamps are identical, only the committer timestamp is
    counted so the same event is not tallied twice.

    Follows the "Polars Core, Pandas Edge" architecture.

    Args:
        df: pandas DataFrame with ``author_timestamp`` and
            ``committer_timestamp`` columns whose string form ends in a
            6-character UTC offset such as ``+02:00``.
        interval: ``"H"`` buckets by hour of day; any other value buckets by
            weekday name.

    Returns:
        pandas Series of counts, named ``"Hour"`` or ``"Weekday"`` and indexed
        by the bucket value under the same name (the shape the previous
        pandas ``groupby(...)[...].count()`` produced).
    """
    timestamp_cols = ["author_timestamp", "committer_timestamp"]

    # === POLARS PROCESSING START ===

    # NOTE(review): to_polars / to_pandas are project helpers; presumably plain
    # pandas <-> Polars DataFrame conversions -- confirm in pages.utils.polars_utils.
    pl_df = to_polars(df)

    # For this use case we want the local wall-clock values, so the trailing
    # "+HH:MM" offset is dropped before parsing. str.head(-6) keeps everything
    # except the last 6 characters; str.slice() cannot be used here because it
    # does not accept a negative length.
    pl_df = pl_df.with_columns(
        [pl.col(name).cast(pl.Utf8).str.head(-6).str.to_datetime().alias(name) for name in timestamp_cols]
    )

    # Null out the author timestamp when it duplicates the committer timestamp.
    pl_df = pl_df.with_columns(
        pl.when(pl.col("author_timestamp") == pl.col("committer_timestamp"))
        .then(None)
        .otherwise(pl.col("author_timestamp"))
        .alias("author_timestamp")
    )

    if interval == "H":
        key = "Hour"
        parts = [pl_df.select(pl.col(name).dt.hour().alias(key)).drop_nulls() for name in timestamp_cols]
        combined = pl.concat(parts)
    else:
        key = "Weekday"
        # Polars numbers weekdays 1 (Monday) through 7 (Sunday).
        weekday_map = {
            1: "Monday",
            2: "Tuesday",
            3: "Wednesday",
            4: "Thursday",
            5: "Friday",
            6: "Saturday",
            7: "Sunday",
        }
        parts = [pl_df.select(pl.col(name).dt.weekday().alias("day_num")).drop_nulls() for name in timestamp_cols]
        combined = pl.concat(parts).select(
            pl.col("day_num").replace_strict(weekday_map, default="Unknown").alias(key)
        )

    # The aggregate needs its own column name: aliasing it to the group key
    # would produce two columns with the same name, which Polars rejects.
    pl_result = combined.group_by(key).agg(pl.len().cast(pl.Int64).alias("count")).sort(key)

    # === POLARS PROCESSING END ===

    # Convert to a pandas Series for compatibility with the existing create_figure.
    result_df = to_pandas(pl_result)
    return result_df.set_index(key)["count"].rename(key)


def create_figure(df: pd.DataFrame, interval):
Expand Down
35 changes: 23 additions & 12 deletions 8Knot/pages/contributors/visualizations/contrib_drive_repeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from dash import callback
from dash.dependencies import Input, Output, State
import pandas as pd
import polars as pl
import logging
import plotly.express as px
from pages.utils.graph_utils import baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from pages.utils.job_utils import nodata_graph
from queries.contributors_query import contributors_query as ctq
import time
Expand Down Expand Up @@ -210,24 +212,33 @@ def repeat_drive_by_graph(repolist, contribs, view, bot_switch):


def process_data(df, view, contribs):
    """
    Select the contribution rows of either drive-by or repeat contributors.

    A contributor is treated as "repeat" when at least one of their rows has
    ``rank == contribs``; every other contributor is "drive-by".

    Follows the "Polars Core, Pandas Edge" architecture.

    Args:
        df: pandas DataFrame with ``created_at``, ``cntrb_id`` and ``rank``
            columns.
        view: ``"drive"`` keeps rows of contributors who never reach the rank;
            any other value keeps rows of contributors who do.
        contribs: rank value a contributor must have reached to count as
            "repeat".

    Returns:
        pandas DataFrame with the selected rows and ``created_at`` cast to
        UTC datetimes.
    """
    # === POLARS PROCESSING START ===

    # NOTE(review): to_polars / to_pandas are project helpers; presumably plain
    # pandas <-> Polars DataFrame conversions -- confirm in pages.utils.polars_utils.
    pl_df = to_polars(df)

    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))

    # Distinct contributors that reached the requested rank.
    repeat_ids = pl_df.filter(pl.col("rank") == contribs).get_column("cntrb_id").unique().to_list()

    # is_in() yields null for a null cntrb_id, and filter() drops null
    # predicates even after negation. Filling with False keeps such rows in
    # the "drive" view, matching the previous pandas `~isin(...)` behaviour.
    is_repeat = pl.col("cntrb_id").is_in(repeat_ids).fill_null(False)

    if view == "drive":
        pl_result = pl_df.filter(~is_repeat)
    else:
        pl_result = pl_df.filter(is_repeat)

    # === POLARS PROCESSING END ===

    # NOTE(review): the pandas version called reset_index(), which also added
    # an "index" column; confirm create_figure does not rely on it.
    return to_pandas(pl_result)


def create_figure(df_cont_subset):
Expand Down
66 changes: 32 additions & 34 deletions 8Knot/pages/contributors/visualizations/contrib_importance_pie.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import polars as pl
import logging
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.contributors_query import contributors_query as ctq
from pages.utils.job_utils import nodata_graph
import time
Expand Down Expand Up @@ -253,51 +255,47 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, start_date, end_date


def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date):
    """
    Build the top-k contributor table (plus an "Other" row) for one action type.

    Follows the "Polars Core, Pandas Edge" architecture.

    Args:
        df: pandas DataFrame with ``created_at``, ``cntrb_id`` and ``Action``
            columns.
        action_type: pattern matched against ``Action`` with ``str.contains``;
            also used as the name of the resulting count column.
        top_k: number of highest-count contributors to keep as their own rows.
        start_date: inclusive lower bound on ``created_at``, or ``None`` for
            no lower bound.
        end_date: inclusive upper bound on ``created_at``, or ``None`` for
            no upper bound.

    Returns:
        pandas DataFrame with columns ``cntrb_id`` and ``<action_type>``: the
        ``top_k`` contributors in descending count order, followed by one
        ``"Other"`` row holding the count of all remaining contributions.
    """
    # === POLARS PROCESSING START ===

    # NOTE(review): to_polars / to_pandas are project helpers; presumably plain
    # pandas <-> Polars DataFrame conversions -- confirm in pages.utils.polars_utils.
    pl_df = to_polars(df)

    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))

    # Polars refuses to compare a Datetime column with a plain string, so the
    # bounds are normalised to timezone-aware datetimes first. This accepts
    # the same strings / dates / timestamps the pandas comparison accepted.
    if start_date is not None:
        lower = pd.to_datetime(start_date, utc=True).to_pydatetime()
        pl_df = pl_df.filter(pl.col("created_at") >= lower)
    if end_date is not None:
        upper = pd.to_datetime(end_date, utc=True).to_pydatetime()
        pl_df = pl_df.filter(pl.col("created_at") <= upper)

    # Keep only rows whose Action matches the requested action type.
    pl_df = pl_df.filter(pl.col("Action").str.contains(action_type))

    # Contributions per contributor, greatest first. The count is cast to
    # Int64 so it lines up with the "Other" row below, and cntrb_id breaks
    # ties so the top-k cut is deterministic.
    pl_grouped = (
        pl_df.group_by("cntrb_id")
        .agg(pl.len().cast(pl.Int64).alias(action_type))
        .sort([action_type, "cntrb_id"], descending=[True, False])
    )

    # Total contributions of this action type, and the share held by the top k.
    t_sum = pl_grouped.get_column(action_type).sum()
    pl_top_k = pl_grouped.head(top_k)
    df_sum = pl_top_k.get_column(action_type).sum()

    # Everything outside the top k is folded into a single "Other" row.
    # vertical_relaxed lets Polars coerce column dtypes to a common supertype
    # instead of failing when the two frames' schemas differ.
    other_row = pl.DataFrame({"cntrb_id": ["Other"], action_type: [t_sum - df_sum]})
    pl_result = pl.concat([pl_top_k, other_row], how="vertical_relaxed")

    # === POLARS PROCESSING END ===

    # Convert to pandas for visualization.
    return to_pandas(pl_result)


def create_figure(df: pd.DataFrame, action_type):
Expand Down
48 changes: 28 additions & 20 deletions 8Knot/pages/contributors/visualizations/contribs_by_action.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import polars as pl
import logging
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.contributors_query import contributors_query as ctq
from pages.utils.job_utils import nodata_graph
import time
Expand Down Expand Up @@ -221,32 +223,38 @@ def contribs_by_action_graph(repolist, interval, action, bot_switch):


def process_data(df: pd.DataFrame, interval, action):
    """
    Reduce contributions to one row per contributor per time period.

    Follows the "Polars Core, Pandas Edge" architecture.

    Args:
        df: pandas DataFrame with ``created_at``, ``cntrb_id`` and ``Action``
            columns.
        interval: period selector. ``"M1"``, ``"M3"``, ``"M6"`` and ``"M12"``
            map to 1-month, 3-month, 6-month and 1-year periods; the
            pandas-style aliases ``"D"``, ``"W"``, ``"M"``, ``"Q"`` and
            ``"Y"`` are also accepted. Any other value falls back to monthly.
        action: pattern matched against ``Action`` with ``str.contains``.

    Returns:
        pandas DataFrame in chronological order holding, for every
        (contributor, period) pair, the earliest matching row with
        ``created_at`` replaced by the start of its period.
    """
    # Polars truncation strings for each supported interval.
    interval_map = {
        "M1": "1mo",
        "M3": "3mo",
        "M6": "6mo",
        "M12": "1y",
        # The pandas implementation passed these aliases straight through.
        "D": "1d",
        "W": "1w",
        "M": "1mo",
        "Q": "3mo",
        "Y": "1y",
    }
    polars_interval = interval_map.get(interval, "1mo")

    # === POLARS PROCESSING START ===

    # NOTE(review): to_polars / to_pandas are project helpers; presumably plain
    # pandas <-> Polars DataFrame conversions -- confirm in pages.utils.polars_utils.
    pl_df = to_polars(df)

    # Order chronologically so "first" below means the earliest contribution.
    pl_df = pl_df.with_columns(pl.col("created_at").cast(pl.Datetime("us", "UTC")))
    pl_df = pl_df.sort("created_at")

    # Drop all contributions that are not the selected action.
    pl_df = pl_df.filter(pl.col("Action").str.contains(action))

    # Snap every timestamp to the start of its period, then keep one row per
    # contributor per period. maintain_order=True preserves the chronological
    # order established above; without it the output row order is unspecified.
    pl_df = pl_df.with_columns(pl.col("created_at").dt.truncate(polars_interval))
    pl_df = pl_df.unique(subset=["cntrb_id", "created_at"], keep="first", maintain_order=True)

    # === POLARS PROCESSING END ===

    # Convert to pandas for visualization.
    return to_pandas(pl_df)


def create_figure(df: pd.DataFrame, interval, action):
Expand Down
Loading
Loading