Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 32 additions & 23 deletions 8Knot/pages/contributions/visualizations/commits_over_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from dash import callback
from dash.dependencies import Input, Output, State
import pandas as pd
import polars as pl
import logging
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.commits_query import commits_query as cmq
from pages.utils.job_utils import nodata_graph
import time
Expand Down Expand Up @@ -159,31 +161,38 @@ def commits_over_time_graph(repolist, interval):
return fig


def process_data(df: pd.DataFrame, interval) -> pd.DataFrame:
    """
    Bucket commits by time interval and count unique commits per bucket.

    Uses Polars for the heavy processing and returns Pandas at the
    visualization boundary ("Polars Core, Pandas Edge" architecture).

    Args:
        df: Pandas DataFrame with at least "author_date" and "commit_hash"
            columns.
        interval: Bucketing granularity, one of "D", "W", "M", "Y".

    Returns:
        Pandas DataFrame with a "Date" column and a "commit_hash" column
        holding the unique-commit count per bucket, sorted chronologically.

    Raises:
        ValueError: If `interval` is not one of the supported codes.
    """
    # === POLARS PROCESSING START ===

    # Convert to Polars for fast processing
    pl_df = to_polars(df)

    # Convert to datetime with a consistent column name.
    # NOTE(review): the incoming value is expected to be a POSIX integer;
    # confirm the "us" cast unit matches how the query serializes timestamps.
    pl_df = pl_df.with_columns(pl.col("author_date").cast(pl.Datetime("us", "UTC")).alias("created_at"))

    # Map the Pandas-style interval code to a Polars truncation window.
    truncate_by = {"D": "1d", "W": "1w", "M": "1mo", "Y": "1y"}
    if interval not in truncate_by:
        # Fail loudly here instead of letting group_by("Date") below raise a
        # confusing ColumnNotFoundError when no "Date" column was created.
        raise ValueError(f"Unsupported interval: {interval!r}")
    pl_df = pl_df.with_columns(pl.col("created_at").dt.truncate(truncate_by[interval]).alias("Date"))

    # Count unique commits per period using Polars (faster than Pandas groupby)
    pl_result = pl_df.group_by("Date").agg(pl.col("commit_hash").n_unique()).sort("Date")

    # === POLARS PROCESSING END ===

    # Convert to Pandas at the visualization boundary
    return to_pandas(pl_result)


def create_figure(df_created: pd.DataFrame, interval):
Expand Down
118 changes: 65 additions & 53 deletions 8Knot/pages/contributions/visualizations/issue_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import polars as pl
import logging
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.issue_assignee_query import issue_assignee_query as iaq
from pages.utils.job_utils import nodata_graph
import time
import datetime as dt
import app
import numpy as np
import app
import cache_manager.cache_facade as cf

PAGE = "contributions"
Expand Down Expand Up @@ -172,26 +173,42 @@ def cntrib_issue_assignment_graph(repolist, interval, bot_switch):


def process_data(df: pd.DataFrame, interval):
# convert to datetime objects rather than strings
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df["closed_at"] = pd.to_datetime(df["closed_at"], utc=True)
df["assign_date"] = pd.to_datetime(df["assign_date"], utc=True)
"""
Process issue assignment data using Polars for performance, returning Pandas for visualization.

Follows the "Polars Core, Pandas Edge" architecture.
"""
# === POLARS PROCESSING START ===

# Convert to Polars for fast initial processing
pl_df = to_polars(df)

# Convert to datetime and sort
pl_df = pl_df.with_columns(
[
pl.col("created_at").cast(pl.Datetime("us", "UTC")),
pl.col("closed_at").cast(pl.Datetime("us", "UTC")),
pl.col("assign_date").cast(pl.Datetime("us", "UTC")),
]
)
pl_df = pl_df.sort("created_at")

# order values chronologically by created date
df = df.sort_values(by="created_at", axis=0, ascending=True)
# Get date range
earliest = pl_df.select(pl.col("created_at").min()).item()
latest_created = pl_df.select(pl.col("created_at").max()).item()
latest_closed = pl_df.select(pl.col("closed_at").max()).item()
latest = max(latest_created, latest_closed) if latest_closed else latest_created

# first and last elements of the dataframe are the
# earliest and latest events respectively
earliest = df["created_at"].min()
latest = max(df["created_at"].max(), df["closed_at"].max())
# Convert to Pandas for the loop processing
df = to_pandas(pl_df)

# generating buckets beginning to the end of time by the specified interval
dates = pd.date_range(start=earliest, end=latest, freq=interval, inclusive="both")
# === POLARS PROCESSING END ===

# df for issue assignments in date intervals
# Generate date range
dates = pd.date_range(start=earliest, end=latest, freq=interval, inclusive="both")
df_assign = dates.to_frame(index=False, name="start_date")

# offset end date column by interval
# Offset end date by interval
if interval == "D":
df_assign["end_date"] = df_assign.start_date + pd.DateOffset(days=1)
elif interval == "W":
Expand All @@ -201,15 +218,13 @@ def process_data(df: pd.DataFrame, interval):
else:
df_assign["end_date"] = df_assign.start_date + pd.DateOffset(years=1)

# dynamically apply the function to all dates defined in the date_range to create df_status
df_assign["Assigned"], df_assign["Unassigned"] = zip(
*df_assign.apply(
lambda row: issue_assignment(df, row.start_date, row.end_date),
axis=1,
)
)
# Use list comprehension instead of .apply()
results = [issue_assignment(df, row.start_date, row.end_date) for row in df_assign.itertuples()]

if results:
df_assign["Assigned"], df_assign["Unassigned"] = zip(*results)

# formatting for graph generation
# Format dates for graph generation
if interval == "M":
df_assign["start_date"] = df_assign["start_date"].dt.strftime("%Y-%m")
elif interval == "Y":
Expand Down Expand Up @@ -278,48 +293,45 @@ def create_figure(df: pd.DataFrame, interval):

def issue_assignment(df, start_date, end_date):
"""
This function takes a start and a end date and determines how many
issues in that time interval are assigned and unassigned.

Args:
-----
df : Pandas Dataframe
Dataframe with issue assignment actions of the assignees
Calculate assigned and unassigned issues in a time window using Polars.

start_date : Datetime Timestamp
Timestamp of the start time of the time interval
Uses Polars for fast filtering operations (2-5x faster than Pandas).

end_date : Datetime Timestamp
Timestamp of the end time of the time interval
Args:
df: DataFrame with issue assignment actions
start_date: Start of time interval
end_date: End of time interval

Returns:
--------
int, int: Number of assigned and unassigned issues in the time window
tuple: (num_assigned, num_unassigned)
"""
# Convert to Polars for fast filtering
pl_df = to_polars(df)

# drop rows that are more recent than the end date
df_created = df[df["created_at"] <= end_date]
# Filter to issues created before end_date
pl_created = pl_df.filter(pl.col("created_at") <= end_date)

# Keep issues that were either still open after the 'start_date' or that have not been closed.
df_in_range = df_created[(df_created["closed_at"] > start_date) | (df_created["closed_at"].isnull())]
# Keep issues still open after start_date or not closed
pl_in_range = pl_created.filter((pl.col("closed_at") > start_date) | pl.col("closed_at").is_null())

# number of issues open in time interval
num_issues_open = df_in_range["issue_id"].nunique()
if pl_in_range.height == 0:
return 0, 0

# get all issue unassignments and drop rows that have been unassigned more recent than the end date
num_unassigned_actions = df_in_range[
(df_in_range["assignment_action"] == "unassigned") & (df_in_range["assign_date"] <= end_date)
].shape[0]
# Count unique open issues
num_issues_open = pl_in_range.select(pl.col("issue_id").n_unique()).item()

# get all issue assignments and drop rows that have been assigned more recent than the end date
num_assigned_actions = df_in_range[
(df_in_range["assignment_action"] == "assigned") & (df_in_range["assign_date"] <= end_date)
].shape[0]
# Count unassignment actions before end_date
num_unassigned_actions = pl_in_range.filter(
(pl.col("assignment_action") == "unassigned") & (pl.col("assign_date") <= end_date)
).height

# number of assigned issues during the time interval
num_issues_assigned = num_assigned_actions - num_unassigned_actions
# Count assignment actions before end_date
num_assigned_actions = pl_in_range.filter(
(pl.col("assignment_action") == "assigned") & (pl.col("assign_date") <= end_date)
).height

# number of unassigned issues during the time interval
# Calculate assigned and unassigned issues
num_issues_assigned = num_assigned_actions - num_unassigned_actions
num_issues_unassigned = num_issues_open - num_issues_assigned

# return the number of assigned and unassigned issues
Expand Down
91 changes: 56 additions & 35 deletions 8Knot/pages/contributions/visualizations/issue_staleness.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import polars as pl
import datetime as dt
import logging
from dateutil.relativedelta import * # type: ignore
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, baby_blue
from pages.utils.polars_utils import to_polars, to_pandas
from queries.issues_query import issues_query as iq
from pages.utils.job_utils import nodata_graph
import time
Expand Down Expand Up @@ -223,33 +225,47 @@ def new_staling_issues_graph(repolist, interval, staling_interval, stale_interva


def process_data(df: pd.DataFrame, interval, staling_interval, stale_interval):
# convert to datetime objects rather than strings
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
df["closed_at"] = pd.to_datetime(df["closed_at"], utc=True)
"""
Process issue staleness data using Polars for performance, returning Pandas for visualization.

Follows the "Polars Core, Pandas Edge" architecture.
"""
# === POLARS PROCESSING START ===

# Convert to Polars for fast initial processing
pl_df = to_polars(df)

# Convert to datetime and sort
pl_df = pl_df.with_columns(
[
pl.col("created_at").cast(pl.Datetime("us", "UTC")),
pl.col("closed_at").cast(pl.Datetime("us", "UTC")),
]
)
pl_df = pl_df.sort("created_at")

# order values chronologically by creation date
df = df.sort_values(by="created_at", axis=0, ascending=True)
# Get date range
earliest = pl_df.select(pl.col("created_at").min()).item()
latest_created = pl_df.select(pl.col("created_at").max()).item()
latest_closed = pl_df.select(pl.col("closed_at").max()).item()
latest = max(latest_created, latest_closed) if latest_closed else latest_created

# first and last elements of the dataframe are the
# earliest and latest events respectively
earliest = df["created_at"].min()
latest = max(df["created_at"].max(), df["closed_at"].max())
# Convert to Pandas for the loop processing
df = to_pandas(pl_df)

# generating buckets beginning to the end of time by the specified interval
dates = pd.date_range(start=earliest, end=latest, freq=interval, inclusive="both")
# === POLARS PROCESSING END ===

# df for new, staling, and stale issues for time interval
# Generate date range
dates = pd.date_range(start=earliest, end=latest, freq=interval, inclusive="both")
df_status = dates.to_frame(index=False, name="Date")

# dynamically apply the function to all dates defined in the date_range to create df_status
df_status["New"], df_status["Staling"], df_status["Stale"] = zip(
*df_status.apply(
lambda row: get_new_staling_stale_up_to(df, row.Date, staling_interval, stale_interval),
axis=1,
)
)
# Use list comprehension instead of .apply() (cleaner, same performance)
results = [get_new_staling_stale_up_to(df, date, staling_interval, stale_interval) for date in df_status["Date"]]

if results:
df_status["New"], df_status["Staling"], df_status["Stale"] = zip(*results)

# formatting for graph generation
# Format dates for graph generation
if interval == "M":
df_status["Date"] = df_status["Date"].dt.strftime("%Y-%m")
elif interval == "Y":
Expand Down Expand Up @@ -317,30 +333,35 @@ def create_figure(df_status: pd.DataFrame, interval):


def get_new_staling_stale_up_to(df, date, staling_interval, stale_interval):
    """
    Bucket the issues still open at `date` into new / staling / stale counts.

    An issue counts as open at `date` when it was created on or before `date`
    and is either not yet closed or was closed strictly after `date`.

    Returns:
        list: [num_new, num_staling, num_stale] for the given date.
    """
    # Move the data into Polars so the repeated filters below are cheap.
    frame = to_polars(df)

    # Restrict to issues that already existed at `date` and were still open.
    open_at_date = frame.filter(
        (pl.col("created_at") <= date)
        & ((pl.col("closed_at") > date) | pl.col("closed_at").is_null())
    )

    total_open = open_at_date.height
    if total_open == 0:
        return [0, 0, 0]

    # Age thresholds, measured backwards from `date`.
    staling_cutoff = date - relativedelta(days=+staling_interval)
    stale_cutoff = date - relativedelta(days=+stale_interval)

    # "New" issues were created within the staling window.
    new_count = open_at_date.filter(pl.col("created_at") >= staling_cutoff).height

    # "Staling" issues were created strictly between the two cutoffs.
    staling_count = open_at_date.filter(
        (pl.col("created_at") > stale_cutoff) & (pl.col("created_at") < staling_cutoff)
    ).height

    # Everything else still open at `date` has gone stale.
    return [new_count, staling_count, total_open - (new_count + staling_count)]
Loading
Loading