# NOTE(review): this arrived as a collapsed git patch. Reconstructed here is the
# module header, card layout, and popover callback of the NEW file
# 8Knot/pages/contributions/visualizations/issue_pr_survival.py.
# The same patch also registers the card in pages/contributions/contributions.py:
#   from .visualizations.issue_pr_survival import gc_issue_pr_survival
#   dbc.Col(gc_issue_pr_survival, width=6)   # appended to the last dbc.Row
from dash import html, dcc, callback
import dash
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import pandas as pd
import logging
import plotly.express as px
from pages.utils.graph_utils import get_graph_time_values, color_seq
from queries.issues_query import issues_query as iq
from queries.prs_query import prs_query as prq
from queries.pr_response_query import pr_response_query as prr
import io
from cache_manager.cache_manager import CacheManager as cm
import cache_manager.cache_facade as cf
from pages.utils.job_utils import nodata_graph
import time
import app
import datetime as dt
# FIX: the patch imported dateutil.relativedelta twice — once as a wildcard
# (`from dateutil.relativedelta import *`) and once explicitly. Only the
# explicit import is kept; it is now actually used for the default start date.
from dateutil.relativedelta import relativedelta


# Page this visualization belongs to and the unique stem for its element ids.
PAGE = "contributions"
VIZ_ID = "issue-pr-survival"

# Card layout: title, info popover, the graph, and its controls
# (date-interval radio buttons, date-picker range, "About Graph" button).
gc_issue_pr_survival = dbc.Card(
    [
        dbc.CardBody(
            [
                html.H3(
                    "Issue & Pull Request Survival Analysis",
                    className="card-title",
                    style={"textAlign": "center"},
                ),
                dbc.Popover(
                    [
                        dbc.PopoverHeader("Graph Info:"),
                        dbc.PopoverBody(
                            """
                            This visualization displays the survival trends of issues and pull requests
                            within a repository. It highlights how long these items typically remain open
                            before they are closed, merged, or addressed (as indicated by the first comment).
                            """
                        ),
                    ],
                    id=f"popover-{PAGE}-{VIZ_ID}",
                    target=f"popover-target-{PAGE}-{VIZ_ID}",
                    placement="top",
                    is_open=False,
                ),
                dcc.Loading(
                    dcc.Graph(id=f"{PAGE}-{VIZ_ID}"),
                ),
                dbc.Form(
                    [
                        dbc.Row(
                            [
                                dbc.Label(
                                    "Date Interval:",
                                    html_for=f"date-interval-{PAGE}-{VIZ_ID}",
                                    width="auto",
                                ),
                                dbc.Col(
                                    dbc.RadioItems(
                                        id=f"date-interval-{PAGE}-{VIZ_ID}",
                                        options=[
                                            {"label": "Day", "value": "D"},
                                            {"label": "Week", "value": "W"},
                                            {"label": "Month", "value": "M"},
                                            {"label": "Year", "value": "Y"},
                                        ],
                                        value="M",
                                        inline=True,
                                    ),
                                    className="me-2",
                                ),
                            ],
                            align="center",
                        ),
                        dbc.Row(
                            [
                                dbc.Col(
                                    dcc.DatePickerRange(
                                        id=f"date-picker-range-{PAGE}-{VIZ_ID}",
                                        min_date_allowed=dt.date(2005, 1, 1),
                                        max_date_allowed=dt.date.today(),
                                        initial_visible_month=dt.date(dt.date.today().year, 1, 1),
                                        # FIX: the patch built this as
                                        # dt.date(today.year - 2, today.month, today.day),
                                        # which raises ValueError when today is Feb 29
                                        # (no Feb 29 two years earlier). relativedelta
                                        # clamps to Feb 28 instead of crashing.
                                        start_date=dt.date.today() - relativedelta(years=2),
                                        clearable=True,
                                    ),
                                    width="auto",
                                ),
                                dbc.Col(
                                    dbc.Button(
                                        "About Graph",
                                        id=f"popover-target-{PAGE}-{VIZ_ID}",
                                        color="secondary",
                                        size="sm",
                                    ),
                                    width="auto",
                                    style={"paddingTop": ".5em"},
                                ),
                            ],
                            align="center",
                            justify="between",
                        ),
                    ]
                ),
            ]
        ),
    ],
)


# callback for graph info popover
@callback(
    Output(f"popover-{PAGE}-{VIZ_ID}", "is_open"),
    [Input(f"popover-target-{PAGE}-{VIZ_ID}", "n_clicks")],
    [State(f"popover-{PAGE}-{VIZ_ID}", "is_open")],
)
def toggle_popover(n, is_open):
    """Toggle the 'Graph Info' popover each time the About Graph button is clicked."""
    if n:
        return not is_open
    return is_open
# callback for issue pr survival graph
@callback(
    Output(f"{PAGE}-{VIZ_ID}", "figure"),
    [
        Input("repo-choices", "data"),
        Input(f"date-interval-{PAGE}-{VIZ_ID}", "value"),
        Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "start_date"),
        Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "end_date"),
        Input("bot-switch", "value"),
    ],
    background=True,
)
def issue_pr_survival_graph(repolist, interval, start_date, end_date, bot_switch):
    """Build the survival-analysis figure for the selected repos and date range.

    Args:
        repolist: repo ids selected in the search bar.
        interval: pandas frequency string from the radio buttons ("D"/"W"/"M"/"Y").
        start_date / end_date: ISO date strings from the date picker (or None).
        bot_switch: truthy when bot-authored activity should be excluded.

    Returns:
        A plotly Figure, or `nodata_graph` when no issue/PR data is cached.
    """
    # Wait for data to asynchronously download and become available.
    # FIX: the patch only waited on the issues and PRs queries, yet read the
    # pr_response cache below — include prr so we never read a cold cache.
    for query_func in [iq, prq, prr]:
        while cf.get_uncached(func_name=query_func.__name__, repolist=repolist):
            logging.warning(f"{VIZ_ID}- WAITING ON DATA TO BECOME AVAILABLE")
            time.sleep(0.5)

    logging.warning(f"{VIZ_ID} - START")
    start = time.perf_counter()

    # GET ALL DATA FROM POSTGRES CACHE
    issues_df = cf.retrieve_from_cache(
        tablename=iq.__name__,
        repolist=repolist,
    )
    prs_df = cf.retrieve_from_cache(
        tablename=prq.__name__,
        repolist=repolist,
    )
    pr_response_df = cf.retrieve_from_cache(
        tablename=prr.__name__,
        repolist=repolist,
    )

    if issues_df.empty and prs_df.empty:
        logging.warning(f"{VIZ_ID} - NO DATA AVAILABLE")
        return nodata_graph

    # Remove bot activity. Only the pr_response table carries contributor ids
    # here, so only it is filtered (matches the patch's behavior).
    if bot_switch:
        pr_response_df = pr_response_df[~pr_response_df["cntrb_id"].isin(app.bots_list)]
        pr_response_df = pr_response_df[~pr_response_df["msg_cntrb_id"].isin(app.bots_list)]

    # All data pre-processing lives in process_data.
    df = process_data(issues_df, prs_df, pr_response_df, interval, start_date, end_date)

    fig = create_figure(df)

    logging.warning(f"{VIZ_ID} - END - {time.perf_counter() - start}")
    return fig


def _survival_series(dates: pd.Series, df: pd.DataFrame, created_col: str, event_col: str) -> pd.Series:
    """For each date, return the fraction of items created on/before that date
    whose `event_col` (close/merge/first-response) had NOT yet happened by that
    date. Returns 1 when nothing had been created yet (nothing could have died).
    """

    def prob(date):
        created = df[df[created_col] <= date]
        if created.empty:
            return 1
        # FIX: the patch counted rows with event_col merely non-null, i.e. items
        # closed at ANY time — even after `date` — were treated as already closed
        # at `date`. Survival at time t must only count events that occurred by t.
        resolved = created[created[event_col].notnull() & (created[event_col] <= date)].shape[0]
        return (created.shape[0] - resolved) / created.shape[0]

    return dates.apply(prob)


def process_data(
    issues_df: pd.DataFrame, prs_df: pd.DataFrame, pr_response_df: pd.DataFrame, interval, start_date, end_date
) -> pd.DataFrame:
    """Compute per-interval survival probabilities for issues and PRs.

    Returns a DataFrame with a "Date" column plus one survival-probability
    column per curve (issue closed, PR merged, PR closed, PR first comment).
    """
    # Normalize timestamps to datetime objects.
    issues_df["created_at"] = pd.to_datetime(issues_df["created_at"], utc=False)
    issues_df["closed_at"] = pd.to_datetime(issues_df["closed_at"], utc=False)

    prs_df["created_at"] = pd.to_datetime(prs_df["created_at"], utc=False)
    prs_df["closed_at"] = pd.to_datetime(prs_df["closed_at"], utc=False)
    prs_df["merged_at"] = pd.to_datetime(prs_df["merged_at"], utc=False)

    pr_response_df["pr_created_at"] = pd.to_datetime(pr_response_df["pr_created_at"], utc=False)
    # FIX: msg_timestamp is compared against dates below but the patch never
    # converted it like every other timestamp column — convert it too.
    pr_response_df["msg_timestamp"] = pd.to_datetime(pr_response_df["msg_timestamp"], utc=False)

    # Drop messages authored by the PR creator (self-replies are not responses).
    pr_response_df = pr_response_df[pr_response_df["cntrb_id"] != pr_response_df["msg_cntrb_id"]]

    # Keep only the earliest response per pull request.
    pr_response_df = pr_response_df.sort_values(by="msg_timestamp", axis=0, ascending=True)
    pr_response_df = pr_response_df.drop_duplicates(subset="pull_request_id", keep="first")

    # Find the earliest and latest events across all three tables.
    earliest = min(
        issues_df["created_at"].min(),
        prs_df["created_at"].min(),
        pr_response_df["pr_created_at"].min(),
    )
    latest = max(
        issues_df["closed_at"].max(),
        prs_df["closed_at"].max(),
        prs_df["merged_at"].max(),
        pr_response_df["msg_timestamp"].max(),
    )

    # Restrict to the date-picker window.
    if start_date is not None:
        issues_df = issues_df[issues_df["created_at"] >= start_date]
        prs_df = prs_df[prs_df["created_at"] >= start_date]
        pr_response_df = pr_response_df[pr_response_df["pr_created_at"] >= start_date]
        earliest = start_date
    if end_date is not None:
        # FIX: the patch filtered `closed_at <= end_date` etc. directly, which
        # silently drops every still-open row because `NaT <= x` is False. Those
        # open rows are exactly the survivors this analysis must count, so keep
        # null timestamps alongside in-range ones.
        issues_df = issues_df[issues_df["closed_at"].isnull() | (issues_df["closed_at"] <= end_date)]
        prs_df = prs_df[
            (prs_df["closed_at"].isnull() | (prs_df["closed_at"] <= end_date))
            & (prs_df["merged_at"].isnull() | (prs_df["merged_at"] <= end_date))
        ]
        pr_response_df = pr_response_df[
            pr_response_df["msg_timestamp"].isnull() | (pr_response_df["msg_timestamp"] <= end_date)
        ]
        latest = end_date

    # One row per tick of the chosen interval.
    dates = pd.date_range(start=earliest, end=latest, freq=interval, inclusive="both")
    df_survival = dates.to_frame(index=False, name="Date")

    # Survival probability per curve (the four copy-pasted lambdas in the patch
    # are consolidated into _survival_series).
    df_survival["issue_closed_survival"] = _survival_series(df_survival["Date"], issues_df, "created_at", "closed_at")
    df_survival["pr_merged_survival"] = _survival_series(df_survival["Date"], prs_df, "created_at", "merged_at")
    df_survival["pr_closed_survival"] = _survival_series(df_survival["Date"], prs_df, "created_at", "closed_at")
    df_survival["pr_to_first_comment_survival"] = _survival_series(
        df_survival["Date"], pr_response_df, "pr_created_at", "msg_timestamp"
    )
    return df_survival
def create_figure(df: pd.DataFrame):
    """Render the four survival curves from process_data as line traces.

    Expects `df` to carry a "Date" column plus one probability column per curve.
    """
    # NOTE(review): the hovertemplates in the collapsed patch are visibly
    # mangled — the text breaks right after %{y:.2f} with a stray trailing
    # space, consistent with stripped HTML tags. Restoring the conventional
    # "<br>" separator and "<extra></extra>" suffix; confirm against the
    # original patch.
    hover = "Survival Probability: %{y:.2f}<br>%{x|%b %d, %Y}<extra></extra>"

    # (legend label, dataframe column, line color) — one tuple per trace; the
    # patch repeated the identical go.Scatter call four times.
    curves = [
        ("Issue Closed", "issue_closed_survival", color_seq[0]),
        ("PR Merged", "pr_merged_survival", color_seq[1]),
        ("PR Closed", "pr_closed_survival", color_seq[2]),
        ("PR to First Comment", "pr_to_first_comment_survival", color_seq[3]),
    ]

    fig = go.Figure(
        [
            go.Scatter(
                name=name,
                x=df["Date"],
                y=df[column],
                mode="lines",
                showlegend=True,
                hovertemplate=hover,
                marker=dict(color=color),
            )
            for name, column, color in curves
        ]
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title="Survival Probability",
        font=dict(size=14),
    )

    return fig