Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ed6209a
removal of duplicates/similar CSS. Prioritizing global variables.
EngCaioFonseca Oct 7, 2025
2b63fce
Replacing spacing for the fontsize
EngCaioFonseca Oct 13, 2025
1bd6b18
refactor: Fix Pandas anti-patterns (Phase 0 of Polars migration)
EngCaioFonseca Dec 13, 2025
dcdbf28
feat: Add Polars and convert first module (Phase 1 & 2)
EngCaioFonseca Dec 13, 2025
6e3e260
feat: Convert code_languages.py and ossf_scorecard.py to Polars
EngCaioFonseca Dec 13, 2025
923363a
feat: Phase 3 - Query layer Polars support + benchmarks + more conver…
EngCaioFonseca Dec 13, 2025
36b7f98
feat: Convert 4 more visualization modules to Polars
EngCaioFonseca Dec 13, 2025
9e68b85
feat: Convert 4 more visualization modules to Polars
EngCaioFonseca Dec 13, 2025
59c368f
feat: Enhance polars_utils.py + convert 3 more contributor modules
EngCaioFonseca Dec 13, 2025
df361f9
feat: Convert 4 more contributor visualizations to Polars
EngCaioFonseca Dec 13, 2025
0a320dc
feat: Convert CHAOSS contrib_importance_pie.py to Polars
EngCaioFonseca Dec 13, 2025
747511c
feat: Convert pr_review_response.py to Polars
EngCaioFonseca Dec 13, 2025
79cadc8
feat: Convert 4 more visualization modules to Polars
EngCaioFonseca Dec 13, 2025
2c4af31
feat: Convert 3 more modules to Polars (affiliation + CHAOSS)
EngCaioFonseca Dec 13, 2025
245df8a
feat: Add Polars imports to heatmap modules
EngCaioFonseca Dec 13, 2025
bdd6260
docs: Update POLARS_MIGRATION_PLAN.md with final status
EngCaioFonseca Dec 13, 2025
ce97469
docs: Add comprehensive code quality evaluation and PR description
EngCaioFonseca Dec 19, 2025
4923421
chore: Update uv.lock to include polars dependency
EngCaioFonseca Dec 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
973 changes: 355 additions & 618 deletions 8Knot/assets/landing_page.css

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions 8Knot/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Benchmarks module for performance testing
257 changes: 257 additions & 0 deletions 8Knot/benchmarks/polars_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
"""
Performance Benchmarks for Polars Migration

This script measures performance improvements from the Polars migration.
Run with: python -m benchmarks.polars_benchmark

Benchmarks:
1. DataFrame creation: Pandas vs Polars from raw data
2. Common operations: groupby, filter, sort
3. The specific anti-patterns we fixed
"""

import time
import numpy as np
import pandas as pd
import polars as pl
from typing import Callable
from dataclasses import dataclass


@dataclass
class BenchmarkResult:
    """Timing comparison of one operation run under Pandas and Polars."""

    name: str           # human-readable benchmark label
    pandas_time: float  # average seconds for the Pandas implementation
    polars_time: float  # average seconds for the Polars implementation

    @property
    def speedup(self) -> float:
        """Pandas time divided by Polars time; inf when the Polars time is zero."""
        if self.polars_time != 0:
            return self.pandas_time / self.polars_time
        return float("inf")

    def __str__(self) -> str:
        # Assemble the report line-by-line; join without a trailing newline.
        lines = [
            f"{self.name}:",
            f" Pandas: {self.pandas_time:.4f}s",
            f" Polars: {self.polars_time:.4f}s",
            f" Speedup: {self.speedup:.2f}x",
        ]
        return "\n".join(lines)


def time_function(func: Callable, n_runs: int = 3) -> float:
    """Return the average wall-clock duration of calling *func* over *n_runs* runs."""
    total = 0.0
    for _ in range(n_runs):
        t0 = time.perf_counter()
        func()
        total += time.perf_counter() - t0
    # Average of n_runs durations (identical to summing a list and dividing by its length).
    return total / n_runs


def generate_test_data(n_rows: int = 100_000) -> dict:
    """Generate a reproducible dict of benchmark columns.

    Args:
        n_rows: Number of rows to generate for every column.

    Returns:
        Dict with numeric, categorical, and datetime columns suitable for
        constructing either a Pandas or a Polars DataFrame.

    Note:
        The RNG is seeded so repeated calls produce identical data, and the
        random draws happen in the same order as before so existing results
        stay reproducible. Uses freq="min" instead of the deprecated "T"
        alias (pandas >= 2.2 emits a FutureWarning for "T").
    """
    np.random.seed(42)
    category = np.random.choice(["A", "B", "C", "D", "E"], n_rows)
    value = np.random.randn(n_rows) * 100
    count = np.random.randint(1, 100, n_rows)
    # One minute-spaced timestamp per row; closed_at is 0-29 days later.
    created = pd.date_range("2020-01-01", periods=n_rows, freq="min")
    closed = created + pd.to_timedelta(np.random.randint(0, 30, n_rows), unit="D")
    return {
        "id": np.arange(n_rows),
        "category": category,
        "value": value,
        "count": count,
        "created_at": created,
        "closed_at": closed,
    }


def benchmark_dataframe_creation(data: dict) -> BenchmarkResult:
    """Compare the cost of building a DataFrame from a raw column dict."""

    def build_pandas():
        pd.DataFrame(data)

    def build_polars():
        pl.DataFrame(data)

    pandas_t = time_function(build_pandas)
    polars_t = time_function(build_polars)
    return BenchmarkResult(
        name="DataFrame Creation",
        pandas_time=pandas_t,
        polars_time=polars_t,
    )


def benchmark_groupby_agg(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare a per-category aggregation (sum of value, mean of count)."""

    def run_pandas():
        pd_df.groupby("category").agg({"value": "sum", "count": "mean"})

    def run_polars():
        aggs = [pl.col("value").sum(), pl.col("count").mean()]
        pl_df.group_by("category").agg(aggs)

    pandas_t = time_function(run_pandas)
    polars_t = time_function(run_polars)
    return BenchmarkResult(
        name="GroupBy Aggregation",
        pandas_time=pandas_t,
        polars_time=polars_t,
    )


def benchmark_filter_sort(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare filtering to positive values followed by a descending sort on count."""

    def run_pandas():
        positive = pd_df[pd_df["value"] > 0]
        positive.sort_values("count", ascending=False)

    def run_polars():
        pl_df.filter(pl.col("value") > 0).sort("count", descending=True)

    return BenchmarkResult(
        name="Filter + Sort",
        pandas_time=time_function(run_pandas),
        polars_time=time_function(run_polars),
    )


def benchmark_conditional_column(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare conditional column assignment (as in code_languages.py)."""

    def run_pandas():
        # Copy first so the in-place .loc assignment never mutates the shared frame.
        scratch = pd_df.copy()
        scratch.loc[scratch["category"] == "A", "value"] = scratch["count"]

    def run_polars():
        expr = pl.when(pl.col("category") == "A").then(pl.col("count")).otherwise(pl.col("value"))
        pl_df.with_columns(expr.alias("value"))

    return BenchmarkResult(
        name="Conditional Column (when/then)",
        pandas_time=time_function(run_pandas),
        polars_time=time_function(run_polars),
    )


def benchmark_vectorized_log(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare vectorized conditional log (replacement for a per-row .apply with math.log)."""

    def run_pandas():
        vals = pd_df["value"]
        np.where(vals > 0, np.log(vals.abs()), 0)

    def run_polars():
        log_expr = pl.when(pl.col("value") > 0).then(pl.col("value").abs().log()).otherwise(0)
        pl_df.select(log_expr.alias("log_value"))

    return BenchmarkResult(
        name="Vectorized Log (anti-pattern fix)",
        pandas_time=time_function(run_pandas),
        polars_time=time_function(run_polars),
    )


def benchmark_cumsum_threshold(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare cumulative-sum threshold search (as in the lottery-factor fix)."""
    threshold = pd_df["count"].sum() * 0.5

    def run_pandas():
        running = pd_df["count"].cumsum()
        np.searchsorted(running.values, threshold, side="left")

    def run_polars():
        running = pl_df.select(pl.col("count").cum_sum())
        # Polars has no searchsorted; take the first row at or past the threshold.
        running.filter(pl.col("count") >= threshold).head(1)

    return BenchmarkResult(
        name="Cumsum + Threshold (lottery factor)",
        pandas_time=time_function(run_pandas),
        polars_time=time_function(run_polars),
    )


def benchmark_open_count_vectorized(pd_df: pd.DataFrame, pl_df: pl.DataFrame) -> BenchmarkResult:
    """Compare counting items open on a given date (as in issues_over_time.py)."""

    # Fixed probe dates; only the first 10 are sampled to keep the benchmark fast.
    probe_dates = pd.date_range("2020-01-15", periods=100, freq="D")

    def run_pandas():
        created = pd_df["created_at"].values
        closed = pd_df["closed_at"].values
        for d in probe_dates[:10]:
            opened_by = created <= d
            unresolved = pd.isna(closed) | (closed > d)
            np.sum(opened_by & unresolved)

    def run_polars():
        for d in probe_dates[:10]:
            open_on_date = (pl.col("created_at") <= d) & (
                pl.col("closed_at").is_null() | (pl.col("closed_at") > d)
            )
            pl_df.filter(open_on_date).height

    return BenchmarkResult(
        name="Open Items Count (vectorized)",
        pandas_time=time_function(run_pandas),
        polars_time=time_function(run_polars),
    )


def run_all_benchmarks():
    """Run every Pandas-vs-Polars benchmark and print a formatted report.

    Builds a shared 100k-row dataset, times each benchmark, then prints
    per-benchmark results followed by an aggregate summary and
    per-benchmark recommendations.
    """
    print("=" * 60)
    print("POLARS MIGRATION PERFORMANCE BENCHMARKS")
    print("=" * 60)
    print()

    # Generate test data once and share it across all benchmarks.
    print("Generating test data (100,000 rows)...")
    data = generate_test_data(100_000)
    pd_df = pd.DataFrame(data)
    pl_df = pl.DataFrame(data)
    print()

    # Run benchmarks
    results = [
        benchmark_dataframe_creation(data),
        benchmark_groupby_agg(pd_df, pl_df),
        benchmark_filter_sort(pd_df, pl_df),
        benchmark_conditional_column(pd_df, pl_df),
        benchmark_vectorized_log(pd_df, pl_df),
        benchmark_cumsum_threshold(pd_df, pl_df),
        benchmark_open_count_vectorized(pd_df, pl_df),
    ]

    # Per-benchmark timings
    print("-" * 60)
    print("RESULTS")
    print("-" * 60)
    for result in results:
        print(result)
        print()

    # Aggregate summary
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    avg_speedup = sum(r.speedup for r in results) / len(results)
    max_speedup = max(results, key=lambda r: r.speedup)
    print(f"Average Speedup: {avg_speedup:.2f}x")
    print(f"Best Speedup: {max_speedup.name} ({max_speedup.speedup:.2f}x)")
    print()
    print("Recommendations:")
    for result in results:
        if result.speedup > 2:
            print(f" ✅ {result.name}: {result.speedup:.2f}x faster with Polars")
        elif result.speedup > 1:
            print(f" ⚡ {result.name}: {result.speedup:.2f}x faster with Polars")
        else:
            # Guard against speedup == 0 (pandas_time measured as exactly 0.0),
            # which previously raised ZeroDivisionError here.
            slowdown = 1 / result.speedup if result.speedup else float("inf")
            print(f" ⚠️ {result.name}: Pandas faster ({slowdown:.2f}x)")


# Entry point: run with `python -m benchmarks.polars_benchmark`.
if __name__ == "__main__":
    run_all_benchmarks()
60 changes: 52 additions & 8 deletions 8Knot/cache_manager/cache_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from psycopg2.extras import execute_values
from psycopg2 import sql as pg_sql
import pandas as pd
import polars as pl
from typing import Literal, Union

# requires relative import syntax "import .cx_common" because
# other files importing cache_facade need to know how to resolve
Expand Down Expand Up @@ -202,17 +204,26 @@ def caching_wrapper(func_name: str, query: str, repolist: list[int], n_repolist_
def retrieve_from_cache(
tablename: str,
repolist: list[int],
) -> pd.DataFrame:
as_polars: bool = False,
) -> Union[pd.DataFrame, pl.DataFrame]:
"""
For a given table in cache, get all results
that having a matching repo_id.

Results are retrieved by a DataFrame, so column names
may need to be overridden by calling function.

Args:
tablename: Name of the cache table
repolist: List of repo IDs to retrieve
as_polars: If True, return a Polars DataFrame (faster for processing).
If False (default), return a Pandas DataFrame (for backward compatibility).

Returns:
DataFrame with cached results (Polars or Pandas based on as_polars flag)
"""

# GET ALL DATA FROM POSTGRES CACHE
df = None
with pg.connect(cache_cx_string) as cache_conn:
with cache_conn.cursor() as cache_cur:
cache_cur.execute(
Expand All @@ -227,10 +238,43 @@ def retrieve_from_cache(
)

logging.warning(f"{tablename} - LOADING DATA FROM CACHE")
df = pd.DataFrame(
cache_cur.fetchall(),
# get df column names from the database columns
columns=[desc[0] for desc in cache_cur.description],
)
logging.warning(f"{tablename} - DATA LOADED - {df.shape} rows,cols")

# Get column names from cursor description
columns = [desc[0] for desc in cache_cur.description]
rows = cache_cur.fetchall()

if as_polars:
# Create Polars DataFrame directly (faster for processing)
df = pl.DataFrame(rows, schema=columns, orient="row")
logging.warning(f"{tablename} - DATA LOADED AS POLARS - {df.shape} rows,cols")
else:
# Create Pandas DataFrame (backward compatible)
df = pd.DataFrame(rows, columns=columns)
logging.warning(f"{tablename} - DATA LOADED AS PANDAS - {df.shape} rows,cols")

return df


def retrieve_from_cache_polars(
    tablename: str,
    repolist: list[int],
) -> pl.DataFrame:
    """Fetch cached rows for the given repos as a Polars DataFrame.

    Thin convenience wrapper around retrieve_from_cache(..., as_polars=True);
    use it when the downstream processing is Polars-native (2-10x faster
    than Pandas). Convert back to Pandas only at the visualization boundary:

        pl_df = retrieve_from_cache_polars(...)
        # ... Polars processing ...
        pd_df = pl_df.to_pandas()  # For Plotly/Dash

    Args:
        tablename: Name of the cache table.
        repolist: Repo IDs whose cached rows should be returned.

    Returns:
        Polars DataFrame with the cached results.
    """
    return retrieve_from_cache(tablename, repolist, as_polars=True)
5 changes: 2 additions & 3 deletions 8Knot/db_manager/augur_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ def run_query(self, query_string: str) -> pd.DataFrame:
except:
raise Exception("DB Read Failure")

result_df = result_df.reset_index()
result_df.drop("index", axis=1, inplace=True)
result_df = result_df.reset_index(drop=True)

return result_df

Expand Down Expand Up @@ -200,7 +199,7 @@ def multiselect_startup(self):
# used when the user selects an org
# Output is of the form: {group_name: [rid1, rid2, ...], group_name: [...], ...}
df_lower_repo_names = df_search_bar.copy()
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].apply(str.lower)
df_lower_repo_names["rg_name"] = df_lower_repo_names["rg_name"].str.lower()
self.org_name_to_repos_dict = df_lower_repo_names.groupby("rg_name")["repo_id"].apply(list).to_dict()
self.org_names = list(self.org_name_to_repos_dict.keys())

Expand Down
Loading
Loading