diff --git a/notebooks/01-blob-inclusion.ipynb b/notebooks/01-blob-inclusion.ipynb
index 68fca04..92201df 100644
--- a/notebooks/01-blob-inclusion.ipynb
+++ b/notebooks/01-blob-inclusion.ipynb
@@ -20,6 +20,7 @@
"import altair as alt\n",
"import numpy as np\n",
"import pandas as pd\n",
+ "import polars as pl\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
@@ -118,13 +119,16 @@
"metadata": {},
"outputs": [],
"source": [
- "df_blocks_blob_epoch = load_parquet(\"blocks_blob_epoch\", target_date)\n",
+ "df_blocks_blob_epoch_pd = load_parquet(\"blocks_blob_epoch\", target_date)\n",
+ "df_blocks_blob_epoch = pl.from_pandas(df_blocks_blob_epoch_pd)\n",
"\n",
"# Format blob count as \"XX blobs\" for display (moved from SQL for cleaner queries)\n",
- "df_blocks_blob_epoch[\"series\"] = df_blocks_blob_epoch[\"blob_count\"].apply(lambda x: f\"{int(x):02d} blobs\")\n",
+ "df_blocks_blob_epoch = df_blocks_blob_epoch.with_columns(\n",
+ " pl.col(\"blob_count\").cast(pl.Int64).map_elements(lambda x: f\"{x:02d} blobs\", return_dtype=pl.Utf8).alias(\"series\")\n",
+ ")\n",
"\n",
"chart = (\n",
- " alt.Chart(df_blocks_blob_epoch)\n",
+ " alt.Chart(df_blocks_blob_epoch.to_pandas())\n",
" .mark_bar()\n",
" .encode(\n",
" x=alt.X(\"time:T\"),\n",
@@ -177,23 +181,32 @@
"metadata": {},
"outputs": [],
"source": [
- "df_blob_popularity = load_parquet(\"blob_popularity\", target_date)\n",
+ "df_blob_popularity_pd = load_parquet(\"blob_popularity\", target_date)\n",
+ "df_blob_popularity = pl.from_pandas(df_blob_popularity_pd)\n",
"\n",
"# Pivot for heatmap\n",
- "df_pivot = df_blob_popularity.pivot(index=\"blob_count\", columns=\"time\", values=\"count\").fillna(0)\n",
+ "df_pivot = df_blob_popularity.pivot(on=\"time\", index=\"blob_count\", values=\"count\").fill_null(0)\n",
+ "\n",
+ "# Extract column order (time columns)\n",
+ "time_cols = [c for c in df_pivot.columns if c != \"blob_count\"]\n",
+ "blob_counts = df_pivot[\"blob_count\"].to_list()\n",
"\n",
"# Create epoch lookup for hover data\n",
- "epoch_lookup = df_blob_popularity.drop_duplicates(subset=[\"time\"]).set_index(\"time\")[\"epoch\"].to_dict()\n",
+ "df_epoch_lookup = df_blob_popularity.unique(subset=[\"time\"]).select([\"time\", \"epoch\"])\n",
+ "epoch_lookup = dict(zip(df_epoch_lookup[\"time\"].to_list(), df_epoch_lookup[\"epoch\"].to_list()))\n",
+ "\n",
+ "# Extract z values as numpy array\n",
+ "z_values = df_pivot.select(time_cols).to_numpy()\n",
"\n",
"fig = go.Figure(\n",
" data=go.Heatmap(\n",
- " z=df_pivot.values,\n",
- " x=df_pivot.columns,\n",
- " y=[str(int(b)) for b in df_pivot.index],\n",
+ " z=z_values,\n",
+ " x=time_cols,\n",
+ " y=[str(int(b)) for b in blob_counts],\n",
" colorscale=\"inferno\",\n",
" reversescale=False,\n",
" colorbar=dict(title=\"Block Count\"),\n",
- " customdata=[[epoch_lookup.get(t, \"\") for t in df_pivot.columns] for _ in df_pivot.index],\n",
+ " customdata=[[epoch_lookup.get(t, \"\") for t in time_cols] for _ in blob_counts],\n",
" hovertemplate=\"Epoch Time: %{x}
Epoch: %{customdata}
Blob Count: %{y}
Block Count: %{z}
Relay: {row['winning_relay']}
Block Count: {row['block_count']}\")\n",
"\n",
"# Relay -> Blob count links\n",
- "for _, row in relay_blob_flow.iterrows():\n",
+ "for row in relay_blob_flow.iter_rows(named=True):\n",
" r_node = f\"R:{row['winning_relay']}\"\n",
" bc_node = f\"{int(row['blob_count'])} blobs\"\n",
" if r_node in node_map and bc_node in node_map:\n",
diff --git a/notebooks/03-column-propagation.ipynb b/notebooks/03-column-propagation.ipynb
index 71f35d5..fd1049f 100644
--- a/notebooks/03-column-propagation.ipynb
+++ b/notebooks/03-column-propagation.ipynb
@@ -20,6 +20,8 @@
"outputs": [],
"source": [
"import numpy as np\n",
+ "import pandas as pd\n",
+ "import polars as pl\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
@@ -55,7 +57,7 @@
"outputs": [],
"source": [
"# Load column propagation data\n",
- "df_col_first_seen = load_parquet(\"col_first_seen\", target_date)\n",
+ "df_col_first_seen = pl.from_pandas(load_parquet(\"col_first_seen\", target_date))\n",
"\n",
"print(f\"Slots with column data: {len(df_col_first_seen)}\")"
]
@@ -81,11 +83,14 @@
"\n",
"# Reshape for heatmap: rows = columns (c0-c127), columns = time\n",
"col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n",
- "df_cols = df_col_first_seen[col_names].T\n",
- "df_cols.columns = df_col_first_seen[\"time\"]\n",
+ "\n",
+ "# Convert to pandas for reshaping (plotly needs pandas)\n",
+ "df_pd = df_col_first_seen.select([\"time\", \"slot\"] + col_names).to_pandas()\n",
+ "df_cols = df_pd[col_names].T\n",
+ "df_cols.columns = df_pd[\"time\"]\n",
"\n",
"# Create slot lookup for hover data\n",
- "slot_values = df_col_first_seen[\"slot\"].values\n",
+ "slot_values = df_pd[\"slot\"].values\n",
"\n",
"# Build customdata: slot number for each column in the heatmap\n",
"customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n",
@@ -129,19 +134,21 @@
"source": [
"# Compute delta from min value per slot for each column\n",
"col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n",
- "df_delta = df_col_first_seen.copy()\n",
"\n",
- "# Calculate row-wise minimum and subtract from each column\n",
- "row_mins = df_delta[col_names].min(axis=1)\n",
- "for col in col_names:\n",
- " df_delta[col] = df_delta[col] - row_mins\n",
+ "# Calculate row-wise minimum using polars horizontal operations\n",
+ "df_delta = df_col_first_seen.with_columns(\n",
+ " pl.min_horizontal(*col_names).alias(\"row_min\")\n",
+ ").with_columns(\n",
+ " [(pl.col(col) - pl.col(\"row_min\")).alias(col) for col in col_names]\n",
+ ").drop(\"row_min\")\n",
"\n",
- "# Reshape for heatmap\n",
- "df_delta_cols = df_delta[col_names].T\n",
- "df_delta_cols.columns = df_delta[\"time\"]\n",
+ "# Convert to pandas for reshaping (plotly needs pandas)\n",
+ "df_delta_pd = df_delta.select([\"time\", \"slot\"] + col_names).to_pandas()\n",
+ "df_delta_cols = df_delta_pd[col_names].T\n",
+ "df_delta_cols.columns = df_delta_pd[\"time\"]\n",
"\n",
"# Create slot lookup for hover data\n",
- "slot_values = df_delta[\"slot\"].values\n",
+ "slot_values = df_delta_pd[\"slot\"].values\n",
"\n",
"# Build customdata: slot number for each column in the heatmap\n",
"customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n",
@@ -188,22 +195,28 @@
"source": [
"# Normalize delta values to 0-1 range per slot\n",
"col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n",
- "df_normalized = df_col_first_seen.copy()\n",
- "\n",
- "# Calculate row-wise min and max, then normalize\n",
- "row_mins = df_normalized[col_names].min(axis=1)\n",
- "row_maxs = df_normalized[col_names].max(axis=1)\n",
- "row_ranges = row_maxs - row_mins\n",
"\n",
- "for col in col_names:\n",
- " df_normalized[col] = (df_normalized[col] - row_mins) / row_ranges.replace(0, np.nan)\n",
+ "# Calculate row-wise min, max, and range using polars horizontal operations\n",
+ "df_normalized = df_col_first_seen.with_columns([\n",
+ " pl.min_horizontal(*col_names).alias(\"row_min\"),\n",
+ " pl.max_horizontal(*col_names).alias(\"row_max\"),\n",
+ "]).with_columns(\n",
+ " (pl.col(\"row_max\") - pl.col(\"row_min\")).alias(\"row_range\")\n",
+ ").with_columns([\n",
+ " pl.when(pl.col(\"row_range\") == 0)\n",
+ " .then(None)\n",
+ " .otherwise((pl.col(col) - pl.col(\"row_min\")) / pl.col(\"row_range\"))\n",
+ " .alias(col)\n",
+ " for col in col_names\n",
+ "]).drop([\"row_min\", \"row_max\", \"row_range\"])\n",
"\n",
- "# Reshape for heatmap\n",
- "df_norm_cols = df_normalized[col_names].T\n",
- "df_norm_cols.columns = df_normalized[\"time\"]\n",
+ "# Convert to pandas for reshaping (plotly needs pandas)\n",
+ "df_norm_pd = df_normalized.select([\"time\", \"slot\"] + col_names).to_pandas()\n",
+ "df_norm_cols = df_norm_pd[col_names].T\n",
+ "df_norm_cols.columns = df_norm_pd[\"time\"]\n",
"\n",
"# Create slot lookup for hover data\n",
- "slot_values = df_normalized[\"slot\"].values\n",
+ "slot_values = df_norm_pd[\"slot\"].values\n",
"\n",
"# Build customdata: slot number for each column in the heatmap\n",
"customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n",
@@ -250,11 +263,16 @@
"source": [
"# Compute column spread (max - min across all columns per slot)\n",
"col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n",
- "df_spread = df_col_first_seen.copy()\n",
- "df_spread[\"column_spread_ms\"] = df_spread[col_names].max(axis=1) - df_spread[col_names].min(axis=1)\n",
+ "\n",
+ "df_spread = df_col_first_seen.with_columns(\n",
+ " (pl.max_horizontal(*col_names) - pl.min_horizontal(*col_names)).alias(\"column_spread_ms\")\n",
+ ")\n",
+ "\n",
+ "# Convert to pandas for plotly\n",
+ "df_spread_pd = df_spread.select([\"time\", \"slot\", \"column_spread_ms\"]).to_pandas()\n",
"\n",
"fig = px.histogram(\n",
- " df_spread,\n",
+ " df_spread_pd,\n",
" x=\"column_spread_ms\",\n",
" nbins=60,\n",
" color_discrete_sequence=[\"#EF553B\"],\n",
@@ -275,14 +293,22 @@
"metadata": {},
"outputs": [],
"source": [
- "# Summary statistics\n",
- "stats = df_spread[\"column_spread_ms\"].describe(percentiles=[0.5, 0.9, 0.95, 0.99])\n",
+ "# Summary statistics using polars\n",
+ "stats = df_spread.select(\"column_spread_ms\").to_series().describe()\n",
+ "percentiles = df_spread.select(\n",
+ " pl.col(\"column_spread_ms\").quantile(0.5).alias(\"p50\"),\n",
+ " pl.col(\"column_spread_ms\").quantile(0.9).alias(\"p90\"),\n",
+ " pl.col(\"column_spread_ms\").quantile(0.95).alias(\"p95\"),\n",
+ " pl.col(\"column_spread_ms\").quantile(0.99).alias(\"p99\"),\n",
+ " pl.col(\"column_spread_ms\").max().alias(\"max\"),\n",
+ ").row(0)\n",
+ "\n",
"print(\"Column spread (ms):\")\n",
- "print(f\" Median: {stats['50%']:.0f}\")\n",
- "print(f\" P90: {stats['90%']:.0f}\")\n",
- "print(f\" P95: {stats['95%']:.0f}\")\n",
- "print(f\" P99: {stats['99%']:.0f}\")\n",
- "print(f\" Max: {stats['max']:.0f}\")"
+ "print(f\" Median: {percentiles[0]:.0f}\")\n",
+ "print(f\" P90: {percentiles[1]:.0f}\")\n",
+ "print(f\" P95: {percentiles[2]:.0f}\")\n",
+ "print(f\" P99: {percentiles[3]:.0f}\")\n",
+ "print(f\" Max: {percentiles[4]:.0f}\")"
]
},
{
@@ -303,7 +329,7 @@
"outputs": [],
"source": [
"fig = px.scatter(\n",
- " df_spread,\n",
+ " df_spread_pd,\n",
" x=\"time\",\n",
" y=\"column_spread_ms\",\n",
" opacity=0.5,\n",
@@ -337,19 +363,28 @@
"# Missing columns heatmap - shows gaps in network coverage\n",
"col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n",
"\n",
- "# Create boolean mask: True (1) where column is missing (NaN)\n",
- "df_missing = df_col_first_seen[col_names].isna().astype(int).T\n",
- "df_missing.columns = df_col_first_seen[\"time\"]\n",
+ "# Count missing data using polars\n",
+ "total_missing = df_col_first_seen.select([\n",
+ " pl.col(col).is_null().sum() for col in col_names\n",
+ "]).sum_horizontal().item()\n",
+ "\n",
+ "slots_with_missing = df_col_first_seen.select(\n",
+ " pl.any_horizontal([pl.col(col).is_null() for col in col_names])\n",
+ ").sum().item()\n",
"\n",
- "# Count missing data\n",
- "total_missing = df_col_first_seen[col_names].isna().sum().sum()\n",
- "slots_with_missing = (df_col_first_seen[col_names].isna().any(axis=1)).sum()\n",
"print(f\"Total missing column observations: {total_missing:,}\")\n",
"print(f\"Slots with at least one missing column: {slots_with_missing:,} ({slots_with_missing/len(df_col_first_seen)*100:.1f}%)\")\n",
"\n",
"if total_missing > 0:\n",
+ " # Convert to pandas for the heatmap visualization\n",
+ " df_pd = df_col_first_seen.select([\"time\", \"slot\"] + col_names).to_pandas()\n",
+ " \n",
+ " # Create boolean mask: True (1) where column is missing (NaN)\n",
+ " df_missing = df_pd[col_names].isna().astype(int).T\n",
+ " df_missing.columns = df_pd[\"time\"]\n",
+ " \n",
" # Create slot lookup for hover data\n",
- " slot_values = df_col_first_seen[\"slot\"].values\n",
+ " slot_values = df_pd[\"slot\"].values\n",
" customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n",
"\n",
" fig = go.Figure(\n",
diff --git a/notebooks/04-mempool-visibility.ipynb b/notebooks/04-mempool-visibility.ipynb
index 7d3dd99..240a6ed 100644
--- a/notebooks/04-mempool-visibility.ipynb
+++ b/notebooks/04-mempool-visibility.ipynb
@@ -2,6 +2,7 @@
"cells": [
{
"cell_type": "markdown",
+ "id": "0",
"metadata": {},
"source": [
"Analysis of transaction visibility in the public mempool before block inclusion on Ethereum mainnet.\n",
@@ -12,6 +13,7 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "1",
"metadata": {
"tags": [
"parameters"
@@ -20,6 +22,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "import polars as pl\n",
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
@@ -57,6 +60,7 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "2",
"metadata": {
"tags": [
"sql-fold"
@@ -70,22 +74,30 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "3",
"metadata": {},
"outputs": [],
"source": [
- "df = load_parquet(\"mempool_availability\", target_date)\n",
- "df[\"tx_type_label\"] = df[\"tx_type\"].map(TX_TYPE_LABELS)\n",
- "df[\"coverage_pct\"] = df[\"seen_before_slot\"] / df[\"total_txs\"] * 100\n",
+ "df = pl.from_pandas(load_parquet(\"mempool_availability\", target_date))\n",
"\n",
- "# Calculate never seen (truly private)\n",
- "df[\"never_seen\"] = df[\"total_txs\"] - df[\"seen_before_slot\"] - df[\"seen_after_slot\"]\n",
+ "df = df.with_columns(\n",
+ " pl.col(\"tx_type\").replace_strict(TX_TYPE_LABELS, default=None).alias(\"tx_type_label\"),\n",
+ " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 100).alias(\"coverage_pct\"),\n",
+ " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n",
+ ")\n",
"\n",
"# Extract p50 age from percentiles array (index 0)\n",
- "df[\"p50_age_ms\"] = df[\"age_percentiles_ms\"].apply(lambda x: x[0] if x is not None and len(x) > 0 else np.nan)\n",
- "df[\"p50_age_s\"] = df[\"p50_age_ms\"] / 1000\n",
+ "df = df.with_columns(\n",
+ " pl.col(\"age_percentiles_ms\").list.get(0).alias(\"p50_age_ms\"),\n",
+ ")\n",
+ "df = df.with_columns(\n",
+ " (pl.col(\"p50_age_ms\") / 1000).alias(\"p50_age_s\"),\n",
+ ")\n",
"\n",
"# Add hour column for time-series aggregation\n",
- "df[\"hour\"] = df[\"slot_start_date_time\"].dt.floor(\"h\")\n",
+ "df = df.with_columns(\n",
+ " pl.col(\"slot_start_date_time\").dt.truncate(\"1h\").alias(\"hour\"),\n",
+ ")\n",
"\n",
"total = df[\"total_txs\"].sum()\n",
"before = df[\"seen_before_slot\"].sum()\n",
@@ -93,7 +105,7 @@
"never = total - before - after\n",
"\n",
"print(f\"Loaded {len(df):,} slot/type rows\")\n",
- "print(f\"Slots: {df['slot'].nunique():,}\")\n",
+ "print(f\"Slots: {df['slot'].n_unique():,}\")\n",
"print(f\"Total transactions: {total:,}\")\n",
"print(f\" Seen before slot: {before:,} ({100*before/total:.1f}%)\")\n",
"print(f\" Seen after slot: {after:,} ({100*after/total:.1f}%)\")\n",
@@ -102,6 +114,7 @@
},
{
"cell_type": "markdown",
+ "id": "4",
"metadata": {},
"source": [
"## Coverage by transaction type\n",
@@ -112,59 +125,70 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "5",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate by type\n",
- "df_summary = df.groupby([\"tx_type\", \"tx_type_label\"]).agg({\n",
- " \"total_txs\": \"sum\",\n",
- " \"seen_before_slot\": \"sum\",\n",
- " \"seen_after_slot\": \"sum\",\n",
- "}).reset_index()\n",
- "df_summary[\"never_seen\"] = df_summary[\"total_txs\"] - df_summary[\"seen_before_slot\"] - df_summary[\"seen_after_slot\"]\n",
- "df_summary[\"before_pct\"] = df_summary[\"seen_before_slot\"] / df_summary[\"total_txs\"] * 100\n",
- "df_summary[\"after_pct\"] = df_summary[\"seen_after_slot\"] / df_summary[\"total_txs\"] * 100\n",
- "df_summary[\"never_pct\"] = df_summary[\"never_seen\"] / df_summary[\"total_txs\"] * 100\n",
+ "df_summary = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n",
+ " pl.col(\"total_txs\").sum(),\n",
+ " pl.col(\"seen_before_slot\").sum(),\n",
+ " pl.col(\"seen_after_slot\").sum(),\n",
+ ")\n",
+ "df_summary = df_summary.with_columns(\n",
+ " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n",
+ " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 100).alias(\"before_pct\"),\n",
+ " (pl.col(\"seen_after_slot\") / pl.col(\"total_txs\") * 100).alias(\"after_pct\"),\n",
+ ")\n",
+ "df_summary = df_summary.with_columns(\n",
+ " (pl.col(\"never_seen\") / pl.col(\"total_txs\") * 100).alias(\"never_pct\"),\n",
+ ")\n",
"\n",
"# Display summary table\n",
- "summary_display = df_summary[[\"tx_type_label\", \"total_txs\", \"before_pct\", \"after_pct\", \"never_pct\"]].copy()\n",
- "summary_display.columns = [\"Type\", \"Total\", \"Before slot %\", \"After slot %\", \"Never seen %\"]\n",
- "for col in summary_display.columns[2:]:\n",
- " summary_display[col] = summary_display[col].round(1)\n",
- "summary_display"
+ "summary_display = df_summary.select(\n",
+ " pl.col(\"tx_type_label\").alias(\"Type\"),\n",
+ " pl.col(\"total_txs\").alias(\"Total\"),\n",
+ " pl.col(\"before_pct\").round(1).alias(\"Before slot %\"),\n",
+ " pl.col(\"after_pct\").round(1).alias(\"After slot %\"),\n",
+ " pl.col(\"never_pct\").round(1).alias(\"Never seen %\"),\n",
+ ")\n",
+ "summary_display.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "6",
"metadata": {},
"outputs": [],
"source": [
"# Coverage stacked bar chart showing before/after/never breakdown\n",
+ "df_summary_pd = df_summary.to_pandas()\n",
+ "\n",
"fig = go.Figure()\n",
"\n",
"fig.add_trace(go.Bar(\n",
- " x=df_summary[\"tx_type_label\"],\n",
- " y=df_summary[\"before_pct\"],\n",
+ " x=df_summary_pd[\"tx_type_label\"],\n",
+ " y=df_summary_pd[\"before_pct\"],\n",
" name=\"Before slot (public)\",\n",
" marker_color=\"#27ae60\",\n",
- " text=df_summary[\"before_pct\"].round(1),\n",
+ " text=df_summary_pd[\"before_pct\"].round(1),\n",
" textposition=\"inside\",\n",
"))\n",
"fig.add_trace(go.Bar(\n",
- " x=df_summary[\"tx_type_label\"],\n",
- " y=df_summary[\"after_pct\"],\n",
+ " x=df_summary_pd[\"tx_type_label\"],\n",
+ " y=df_summary_pd[\"after_pct\"],\n",
" name=\"After slot (propagated)\",\n",
" marker_color=\"#3498db\",\n",
- " text=df_summary[\"after_pct\"].round(1),\n",
+ " text=df_summary_pd[\"after_pct\"].round(1),\n",
" textposition=\"inside\",\n",
"))\n",
"fig.add_trace(go.Bar(\n",
- " x=df_summary[\"tx_type_label\"],\n",
- " y=df_summary[\"never_pct\"],\n",
+ " x=df_summary_pd[\"tx_type_label\"],\n",
+ " y=df_summary_pd[\"never_pct\"],\n",
" name=\"Never seen (private)\",\n",
" marker_color=\"#95a5a6\",\n",
- " text=df_summary[\"never_pct\"].round(1),\n",
+ " text=df_summary_pd[\"never_pct\"].round(1),\n",
" textposition=\"inside\",\n",
"))\n",
"\n",
@@ -182,6 +206,7 @@
},
{
"cell_type": "markdown",
+ "id": "7",
"metadata": {},
"source": [
"## Hourly coverage trends\n",
@@ -192,19 +217,24 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "8",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate to hourly for time-series\n",
- "df_hourly = df.groupby([\"hour\", \"tx_type\", \"tx_type_label\"]).agg({\n",
- " \"total_txs\": \"sum\",\n",
- " \"seen_before_slot\": \"sum\",\n",
- " \"seen_after_slot\": \"sum\",\n",
- "}).reset_index()\n",
- "df_hourly[\"coverage_pct\"] = df_hourly[\"seen_before_slot\"] / df_hourly[\"total_txs\"] * 100\n",
+ "df_hourly = df.group_by([\"hour\", \"tx_type\", \"tx_type_label\"]).agg(\n",
+ " pl.col(\"total_txs\").sum(),\n",
+ " pl.col(\"seen_before_slot\").sum(),\n",
+ " pl.col(\"seen_after_slot\").sum(),\n",
+ ")\n",
+ "df_hourly = df_hourly.with_columns(\n",
+ " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 100).alias(\"coverage_pct\"),\n",
+ ")\n",
+ "\n",
+ "df_hourly_pd = df_hourly.to_pandas()\n",
"\n",
"fig = px.line(\n",
- " df_hourly,\n",
+ " df_hourly_pd,\n",
" x=\"hour\",\n",
" y=\"coverage_pct\",\n",
" color=\"tx_type_label\",\n",
@@ -222,6 +252,7 @@
},
{
"cell_type": "markdown",
+ "id": "9",
"metadata": {},
"source": [
"## Transaction volume over time\n",
@@ -232,33 +263,38 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "10",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate across types by hour - 3-way breakdown\n",
- "df_volume = df.groupby(\"hour\").agg({\n",
- " \"total_txs\": \"sum\",\n",
- " \"seen_before_slot\": \"sum\",\n",
- " \"seen_after_slot\": \"sum\",\n",
- "}).reset_index()\n",
- "df_volume[\"never_seen\"] = df_volume[\"total_txs\"] - df_volume[\"seen_before_slot\"] - df_volume[\"seen_after_slot\"]\n",
+ "df_volume = df.group_by(\"hour\").agg(\n",
+ " pl.col(\"total_txs\").sum(),\n",
+ " pl.col(\"seen_before_slot\").sum(),\n",
+ " pl.col(\"seen_after_slot\").sum(),\n",
+ ")\n",
+ "df_volume = df_volume.with_columns(\n",
+ " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n",
+ ")\n",
+ "\n",
+ "df_volume_pd = df_volume.to_pandas()\n",
"\n",
"fig = go.Figure()\n",
"fig.add_trace(go.Bar(\n",
- " x=df_volume[\"hour\"],\n",
- " y=df_volume[\"seen_before_slot\"],\n",
+ " x=df_volume_pd[\"hour\"],\n",
+ " y=df_volume_pd[\"seen_before_slot\"],\n",
" name=\"Before slot (public)\",\n",
" marker_color=\"#27ae60\",\n",
"))\n",
"fig.add_trace(go.Bar(\n",
- " x=df_volume[\"hour\"],\n",
- " y=df_volume[\"seen_after_slot\"],\n",
+ " x=df_volume_pd[\"hour\"],\n",
+ " y=df_volume_pd[\"seen_after_slot\"],\n",
" name=\"After slot (propagated)\",\n",
" marker_color=\"#3498db\",\n",
"))\n",
"fig.add_trace(go.Bar(\n",
- " x=df_volume[\"hour\"],\n",
- " y=df_volume[\"never_seen\"],\n",
+ " x=df_volume_pd[\"hour\"],\n",
+ " y=df_volume_pd[\"never_seen\"],\n",
" name=\"Never seen (private)\",\n",
" marker_color=\"#95a5a6\",\n",
"))\n",
@@ -275,6 +311,7 @@
},
{
"cell_type": "markdown",
+ "id": "11",
"metadata": {},
"source": [
"## Coverage heatmap\n",
@@ -285,17 +322,30 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "12",
"metadata": {},
"outputs": [],
"source": [
"# Pivot for heatmap using hourly aggregated data\n",
- "df_pivot = df_hourly.pivot(index=\"tx_type_label\", columns=\"hour\", values=\"coverage_pct\").fillna(0)\n",
+ "df_pivot = df_hourly.pivot(\n",
+ " on=\"hour\",\n",
+ " index=\"tx_type_label\",\n",
+ " values=\"coverage_pct\",\n",
+ ").fill_null(0)\n",
+ "\n",
+ "# Get column order (all columns except tx_type_label, sorted)\n",
+ "value_cols = [c for c in df_pivot.columns if c != \"tx_type_label\"]\n",
+ "value_cols_sorted = sorted(value_cols)\n",
+ "\n",
+ "# Extract data for heatmap\n",
+ "z_values = df_pivot.select(value_cols_sorted).to_numpy()\n",
+ "y_labels = df_pivot[\"tx_type_label\"].to_list()\n",
"\n",
"fig = go.Figure(\n",
" data=go.Heatmap(\n",
- " z=df_pivot.values,\n",
- " x=df_pivot.columns,\n",
- " y=df_pivot.index,\n",
+ " z=z_values,\n",
+ " x=value_cols_sorted,\n",
+ " y=y_labels,\n",
" colorscale=\"Greens\",\n",
" colorbar=dict(title=dict(text=\"Coverage %\", side=\"right\")),\n",
" )\n",
@@ -311,6 +361,7 @@
},
{
"cell_type": "markdown",
+ "id": "13",
"metadata": {},
"source": [
"## Mempool age distribution\n",
@@ -321,62 +372,64 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "14",
"metadata": {},
"outputs": [],
"source": [
- "# Extract all percentiles for each type\n",
- "def extract_percentiles(group):\n",
- " # Collect all non-null percentile arrays, weighted by seen_before_slot count\n",
- " pct_arrays = []\n",
- " for _, row in group.iterrows():\n",
- " if row['seen_before_slot'] > 0 and row['age_percentiles_ms'] is not None:\n",
- " pcts = row['age_percentiles_ms']\n",
- " if not any(np.isnan(pcts)):\n",
- " pct_arrays.append(pcts)\n",
- " \n",
- " if not pct_arrays:\n",
- " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n",
- " \n",
- " # Average percentiles across slots (simple mean for now)\n",
- " avg_pcts = np.nanmean(pct_arrays, axis=0)\n",
- " return pd.Series({\n",
- " 'p50': avg_pcts[0] / 1000,\n",
- " 'p75': avg_pcts[1] / 1000,\n",
- " 'p80': avg_pcts[2] / 1000,\n",
- " 'p85': avg_pcts[3] / 1000,\n",
- " 'p90': avg_pcts[4] / 1000,\n",
- " 'p95': avg_pcts[5] / 1000,\n",
- " 'p99': avg_pcts[6] / 1000,\n",
- " })\n",
- "\n",
- "df_age = df.groupby(['tx_type', 'tx_type_label']).apply(extract_percentiles, include_groups=False).reset_index()\n",
+ "# Extract percentiles by tx_type using polars aggregation\n",
+ "# Filter to rows with valid data, then compute mean percentiles per type\n",
+ "df_with_pcts = df.filter(\n",
+ " (pl.col(\"seen_before_slot\") > 0) & \n",
+ " pl.col(\"age_percentiles_ms\").is_not_null() &\n",
+ " (pl.col(\"age_percentiles_ms\").list.len() >= 7)\n",
+ ")\n",
+ "\n",
+ "# Extract individual percentiles and compute mean per tx_type\n",
+ "df_age = df_with_pcts.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(0).mean() / 1000).alias(\"p50\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(1).mean() / 1000).alias(\"p75\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(2).mean() / 1000).alias(\"p80\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(3).mean() / 1000).alias(\"p85\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(4).mean() / 1000).alias(\"p90\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(5).mean() / 1000).alias(\"p95\"),\n",
+ " (pl.col(\"age_percentiles_ms\").list.get(6).mean() / 1000).alias(\"p99\"),\n",
+ ")\n",
"\n",
"# Display age table\n",
- "age_display = df_age[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n",
- "age_display.columns = ['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n",
- "for col in age_display.columns[1:]:\n",
- " age_display[col] = age_display[col].round(1)\n",
- "age_display"
+ "age_display = df_age.select(\n",
+ " pl.col(\"tx_type_label\").alias(\"Type\"),\n",
+ " pl.col(\"p50\").round(1).alias(\"p50 (s)\"),\n",
+ " pl.col(\"p75\").round(1).alias(\"p75 (s)\"),\n",
+ " pl.col(\"p90\").round(1).alias(\"p90 (s)\"),\n",
+ " pl.col(\"p95\").round(1).alias(\"p95 (s)\"),\n",
+ " pl.col(\"p99\").round(1).alias(\"p99 (s)\"),\n",
+ ")\n",
+ "age_display.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "15",
"metadata": {},
"outputs": [],
"source": [
"# Visualize age percentiles as line chart\n",
- "df_age_long = df_age.melt(\n",
- " id_vars=['tx_type', 'tx_type_label'],\n",
- " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n",
- " var_name='percentile',\n",
- " value_name='age_s'\n",
+ "df_age_long = df_age.unpivot(\n",
+ " index=[\"tx_type\", \"tx_type_label\"],\n",
+ " on=[\"p50\", \"p75\", \"p80\", \"p85\", \"p90\", \"p95\", \"p99\"],\n",
+ " variable_name=\"percentile\",\n",
+ " value_name=\"age_s\",\n",
")\n",
"# Convert percentile labels to numeric for x-axis\n",
- "df_age_long['pct_num'] = df_age_long['percentile'].str.replace('p', '').astype(int)\n",
+ "df_age_long = df_age_long.with_columns(\n",
+ " pl.col(\"percentile\").str.replace(\"p\", \"\").cast(pl.Int64).alias(\"pct_num\"),\n",
+ ")\n",
+ "\n",
+ "df_age_long_pd = df_age_long.to_pandas()\n",
"\n",
"fig = px.line(\n",
- " df_age_long,\n",
+ " df_age_long_pd,\n",
" x='pct_num',\n",
" y='age_s',\n",
" color='tx_type_label',\n",
@@ -397,28 +450,37 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "16",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate histogram buckets across all slots per tx type\n",
"hist_cols = [f'age_hist_{i}' for i in range(15)]\n",
- "df_hist = df.groupby(['tx_type', 'tx_type_label'])[hist_cols].sum().reset_index()\n",
+ "df_hist = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n",
+ " [pl.col(c).sum() for c in hist_cols]\n",
+ ")\n",
"\n",
"# Melt to long format for plotting\n",
- "df_hist_long = df_hist.melt(\n",
- " id_vars=['tx_type', 'tx_type_label'],\n",
- " value_vars=hist_cols,\n",
- " var_name='bucket',\n",
- " value_name='count'\n",
+ "df_hist_long = df_hist.unpivot(\n",
+ " index=[\"tx_type\", \"tx_type_label\"],\n",
+ " on=hist_cols,\n",
+ " variable_name=\"bucket\",\n",
+ " value_name=\"count\",\n",
+ ")\n",
+ "df_hist_long = df_hist_long.with_columns(\n",
+ " pl.col(\"bucket\").str.extract(r\"(\\d+)\").cast(pl.Int64).alias(\"bucket_idx\"),\n",
+ ")\n",
+ "df_hist_long = df_hist_long.with_columns(\n",
+ " pl.col(\"bucket_idx\").replace_strict(dict(enumerate(HIST_LABELS)), default=None).alias(\"bucket_label\"),\n",
")\n",
- "df_hist_long['bucket_idx'] = df_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n",
- "df_hist_long['bucket_label'] = df_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n",
"\n",
"# Sort by bucket index for proper ordering\n",
- "df_hist_long = df_hist_long.sort_values(['tx_type', 'bucket_idx'])\n",
+ "df_hist_long = df_hist_long.sort([\"tx_type\", \"bucket_idx\"])\n",
+ "\n",
+ "df_hist_long_pd = df_hist_long.to_pandas()\n",
"\n",
"fig = px.bar(\n",
- " df_hist_long,\n",
+ " df_hist_long_pd,\n",
" x='bucket_label',\n",
" y='count',\n",
" color='tx_type_label',\n",
@@ -441,6 +503,7 @@
},
{
"cell_type": "markdown",
+ "id": "17",
"metadata": {},
"source": [
"## Propagation delay (seen after slot)\n",
@@ -451,60 +514,62 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "18",
"metadata": {},
"outputs": [],
"source": [
"# Extract delay percentiles for transactions seen AFTER slot start\n",
- "def extract_delay_percentiles(group):\n",
- " pct_arrays = []\n",
- " for _, row in group.iterrows():\n",
- " if row['seen_after_slot'] > 0 and row['delay_percentiles_ms'] is not None:\n",
- " pcts = row['delay_percentiles_ms']\n",
- " if not any(np.isnan(pcts)):\n",
- " pct_arrays.append(pcts)\n",
- " \n",
- " if not pct_arrays:\n",
- " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n",
- " \n",
- " avg_pcts = np.nanmean(pct_arrays, axis=0)\n",
- " return pd.Series({\n",
- " 'p50': avg_pcts[0] / 1000,\n",
- " 'p75': avg_pcts[1] / 1000,\n",
- " 'p80': avg_pcts[2] / 1000,\n",
- " 'p85': avg_pcts[3] / 1000,\n",
- " 'p90': avg_pcts[4] / 1000,\n",
- " 'p95': avg_pcts[5] / 1000,\n",
- " 'p99': avg_pcts[6] / 1000,\n",
- " })\n",
- "\n",
- "df_delay = df.groupby(['tx_type', 'tx_type_label']).apply(extract_delay_percentiles, include_groups=False).reset_index()\n",
+ "df_with_delay = df.filter(\n",
+ " (pl.col(\"seen_after_slot\") > 0) & \n",
+ " pl.col(\"delay_percentiles_ms\").is_not_null() &\n",
+ " (pl.col(\"delay_percentiles_ms\").list.len() >= 7)\n",
+ ")\n",
+ "\n",
+ "df_delay = df_with_delay.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(0).mean() / 1000).alias(\"p50\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(1).mean() / 1000).alias(\"p75\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(2).mean() / 1000).alias(\"p80\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(3).mean() / 1000).alias(\"p85\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(4).mean() / 1000).alias(\"p90\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(5).mean() / 1000).alias(\"p95\"),\n",
+ " (pl.col(\"delay_percentiles_ms\").list.get(6).mean() / 1000).alias(\"p99\"),\n",
+ ")\n",
"\n",
"# Display delay table\n",
- "delay_display = df_delay[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n",
- "delay_display.columns = ['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n",
- "for col in delay_display.columns[1:]:\n",
- " delay_display[col] = delay_display[col].round(2)\n",
- "delay_display"
+ "delay_display = df_delay.select(\n",
+ " pl.col(\"tx_type_label\").alias(\"Type\"),\n",
+ " pl.col(\"p50\").round(2).alias(\"p50 (s)\"),\n",
+ " pl.col(\"p75\").round(2).alias(\"p75 (s)\"),\n",
+ " pl.col(\"p90\").round(2).alias(\"p90 (s)\"),\n",
+ " pl.col(\"p95\").round(2).alias(\"p95 (s)\"),\n",
+ " pl.col(\"p99\").round(2).alias(\"p99 (s)\"),\n",
+ ")\n",
+ "delay_display.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "19",
"metadata": {},
"outputs": [],
"source": [
"# Visualize delay percentiles as line chart\n",
- "df_delay_long = df_delay.melt(\n",
- " id_vars=['tx_type', 'tx_type_label'],\n",
- " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n",
- " var_name='percentile',\n",
- " value_name='delay_s'\n",
+ "df_delay_long = df_delay.unpivot(\n",
+ " index=[\"tx_type\", \"tx_type_label\"],\n",
+ " on=[\"p50\", \"p75\", \"p80\", \"p85\", \"p90\", \"p95\", \"p99\"],\n",
+ " variable_name=\"percentile\",\n",
+ " value_name=\"delay_s\",\n",
")\n",
"# Convert percentile labels to numeric for x-axis\n",
- "df_delay_long['pct_num'] = df_delay_long['percentile'].str.replace('p', '').astype(int)\n",
+ "df_delay_long = df_delay_long.with_columns(\n",
+ " pl.col(\"percentile\").str.replace(\"p\", \"\").cast(pl.Int64).alias(\"pct_num\"),\n",
+ ")\n",
+ "\n",
+ "df_delay_long_pd = df_delay_long.to_pandas()\n",
"\n",
"fig = px.line(\n",
- " df_delay_long,\n",
+ " df_delay_long_pd,\n",
" x='pct_num',\n",
" y='delay_s',\n",
" color='tx_type_label',\n",
@@ -525,28 +590,37 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "20",
"metadata": {},
"outputs": [],
"source": [
"# Aggregate delay histogram buckets across all slots per tx type\n",
"delay_hist_cols = [f'delay_hist_{i}' for i in range(15)]\n",
- "df_delay_hist = df.groupby(['tx_type', 'tx_type_label'])[delay_hist_cols].sum().reset_index()\n",
+ "df_delay_hist = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n",
+ " [pl.col(c).sum() for c in delay_hist_cols]\n",
+ ")\n",
"\n",
"# Melt to long format for plotting\n",
- "df_delay_hist_long = df_delay_hist.melt(\n",
- " id_vars=['tx_type', 'tx_type_label'],\n",
- " value_vars=delay_hist_cols,\n",
- " var_name='bucket',\n",
- " value_name='count'\n",
+ "df_delay_hist_long = df_delay_hist.unpivot(\n",
+ " index=[\"tx_type\", \"tx_type_label\"],\n",
+ " on=delay_hist_cols,\n",
+ " variable_name=\"bucket\",\n",
+ " value_name=\"count\",\n",
+ ")\n",
+ "df_delay_hist_long = df_delay_hist_long.with_columns(\n",
+ " pl.col(\"bucket\").str.extract(r\"(\\d+)\").cast(pl.Int64).alias(\"bucket_idx\"),\n",
+ ")\n",
+ "df_delay_hist_long = df_delay_hist_long.with_columns(\n",
+ " pl.col(\"bucket_idx\").replace_strict(dict(enumerate(HIST_LABELS)), default=None).alias(\"bucket_label\"),\n",
")\n",
- "df_delay_hist_long['bucket_idx'] = df_delay_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n",
- "df_delay_hist_long['bucket_label'] = df_delay_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n",
"\n",
"# Sort by bucket index for proper ordering\n",
- "df_delay_hist_long = df_delay_hist_long.sort_values(['tx_type', 'bucket_idx'])\n",
+ "df_delay_hist_long = df_delay_hist_long.sort([\"tx_type\", \"bucket_idx\"])\n",
+ "\n",
+ "df_delay_hist_long_pd = df_delay_hist_long.to_pandas()\n",
"\n",
"fig = px.bar(\n",
- " df_delay_hist_long,\n",
+ " df_delay_hist_long_pd,\n",
" x='bucket_label',\n",
" y='count',\n",
" color='tx_type_label',\n",
@@ -569,6 +643,7 @@
},
{
"cell_type": "markdown",
+ "id": "21",
"metadata": {},
"source": [
"## Sentry coverage\n",
@@ -579,6 +654,7 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "22",
"metadata": {
"tags": [
"sql-fold"
@@ -592,16 +668,21 @@
{
"cell_type": "code",
"execution_count": null,
+ "id": "23",
"metadata": {},
"outputs": [],
"source": [
- "df_sentry = load_parquet(\"sentry_coverage\", target_date)\n",
+ "df_sentry = pl.from_pandas(load_parquet(\"sentry_coverage\", target_date))\n",
"\n",
"# Shorten sentry names for display\n",
- "df_sentry[\"sentry_short\"] = df_sentry[\"sentry\"].str.replace(\"ethpandaops/mainnet/\", \"\")\n",
+ "df_sentry = df_sentry.with_columns(\n",
+ " pl.col(\"sentry\").str.replace(\"ethpandaops/mainnet/\", \"\").alias(\"sentry_short\"),\n",
+ ")\n",
+ "\n",
+ "df_sentry_pd = df_sentry.head(15).to_pandas()\n",
"\n",
"fig = px.bar(\n",
- " df_sentry.head(15),\n",
+ " df_sentry_pd,\n",
" x=\"coverage_pct\",\n",
" y=\"sentry_short\",\n",
" orientation=\"h\",\n",
@@ -631,5 +712,5 @@
}
},
"nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 5
}
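Two patterns recur throughout the notebook-04 rewrite: unpivot as the polars counterpart of pandas melt (id_vars becomes index, value_vars becomes on), and list.get(i) to pull a fixed position out of the percentile arrays. A minimal sketch of the unpivot half, with assumed values:

    import polars as pl

    toy = pl.DataFrame({
        "tx_type_label": ["Legacy", "Blob"],
        "p50": [1.2, 3.4],
        "p90": [8.0, 20.5],
    })

    long = toy.unpivot(
        index=["tx_type_label"],
        on=["p50", "p90"],
        variable_name="percentile",
        value_name="age_s",
    ).with_columns(
        # "p50" -> 50 so the percentile can be used as a numeric x-axis
        pl.col("percentile").str.replace("p", "").cast(pl.Int64).alias("pct_num")
    )
    print(long)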
diff --git a/notebooks/05-mev-pipeline.ipynb b/notebooks/05-mev-pipeline.ipynb
index 5f835ec..061598b 100644
--- a/notebooks/05-mev-pipeline.ipynb
+++ b/notebooks/05-mev-pipeline.ipynb
@@ -2,7 +2,6 @@
"cells": [
{
"cell_type": "markdown",
- "id": "0",
"metadata": {},
"source": [
"Analysis of MEV pipeline timing and its effect on block propagation on Ethereum mainnet."
@@ -11,7 +10,6 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "1",
"metadata": {
"tags": [
"parameters"
@@ -20,6 +18,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "import polars as pl\n",
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
@@ -34,10 +33,8 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "2",
"metadata": {
"tags": [
- "hide-input",
"sql-fold"
]
},
@@ -49,24 +46,29 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "3",
"metadata": {},
"outputs": [],
"source": [
- "df = load_parquet(\"block_production_timeline\", target_date)\n",
+ "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n",
"\n",
"# Flag MEV vs local blocks\n",
- "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n",
- "df[\"block_type\"] = df[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n",
+ "df = df.with_columns(\n",
+ " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\")\n",
+ ").with_columns(\n",
+ " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\")\n",
+ ")\n",
+ "\n",
+ "total_slots = len(df)\n",
+ "mev_count = df.filter(pl.col(\"has_mev\")).height\n",
+ "local_count = df.filter(~pl.col(\"has_mev\")).height\n",
"\n",
- "print(f\"Total slots: {len(df):,}\")\n",
- "print(f\"MEV blocks: {df['has_mev'].sum():,} ({df['has_mev'].mean()*100:.1f}%)\")\n",
- "print(f\"Local blocks: {(~df['has_mev']).sum():,} ({(~df['has_mev']).mean()*100:.1f}%)\")"
+ "print(f\"Total slots: {total_slots:,}\")\n",
+ "print(f\"MEV blocks: {mev_count:,} ({mev_count/total_slots*100:.1f}%)\")\n",
+ "print(f\"Local blocks: {local_count:,} ({local_count/total_slots*100:.1f}%)\")"
]
},
{
"cell_type": "markdown",
- "id": "4",
"metadata": {},
"source": [
"## Bid trace coverage\n",
@@ -82,27 +84,27 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "5",
"metadata": {},
"outputs": [],
"source": [
"# Bid trace coverage analysis\n",
- "df_trace = df[df[\"has_mev\"]].copy()\n",
- "df_trace[\"relay\"] = df_trace[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else \"Unknown\")\n",
- "df_trace[\"has_bid_timing\"] = df_trace[\"winning_bid_ms\"].notna()\n",
+ "df_trace = df.filter(pl.col(\"has_mev\")).with_columns(\n",
+ " pl.col(\"winning_relays\").list.get(0).fill_null(\"Unknown\").alias(\"relay\"),\n",
+ " pl.col(\"winning_bid_ms\").is_not_null().alias(\"has_bid_timing\")\n",
+ ")\n",
"\n",
"# Aggregate by relay\n",
- "relay_coverage = df_trace.groupby(\"relay\").agg(\n",
- " total=(\"slot\", \"count\"),\n",
- " with_timing=(\"has_bid_timing\", \"sum\"),\n",
- ").reset_index()\n",
- "relay_coverage[\"without_timing\"] = relay_coverage[\"total\"] - relay_coverage[\"with_timing\"]\n",
- "relay_coverage[\"pct_with_timing\"] = (relay_coverage[\"with_timing\"] / relay_coverage[\"total\"] * 100).round(1)\n",
- "relay_coverage = relay_coverage.sort_values(\"total\", ascending=True)\n",
+ "relay_coverage = df_trace.group_by(\"relay\").agg(\n",
+ " pl.col(\"slot\").count().alias(\"total\"),\n",
+ " pl.col(\"has_bid_timing\").sum().alias(\"with_timing\"),\n",
+ ").with_columns(\n",
+ " (pl.col(\"total\") - pl.col(\"with_timing\")).alias(\"without_timing\"),\n",
+ " (pl.col(\"with_timing\") / pl.col(\"total\") * 100).round(1).alias(\"pct_with_timing\")\n",
+ ").sort(\"total\")\n",
"\n",
"# Summary stats\n",
- "total_mev = relay_coverage[\"total\"].sum()\n",
- "total_with_timing = relay_coverage[\"with_timing\"].sum()\n",
+ "total_mev = relay_coverage.select(pl.col(\"total\").sum()).item()\n",
+ "total_with_timing = relay_coverage.select(pl.col(\"with_timing\").sum()).item()\n",
"print(f\"MEV blocks: {total_mev:,}\")\n",
"print(f\"With bid timing: {total_with_timing:,} ({total_with_timing/total_mev*100:.1f}%)\")\n",
"print(f\"Without bid timing: {total_mev - total_with_timing:,} ({(total_mev - total_with_timing)/total_mev*100:.1f}%)\")"
@@ -111,27 +113,28 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "6",
"metadata": {},
"outputs": [],
"source": [
"# Stacked horizontal bar chart\n",
+ "relay_coverage_pd = relay_coverage.to_pandas()\n",
+ "\n",
"fig = go.Figure()\n",
"\n",
"fig.add_trace(go.Bar(\n",
- " y=relay_coverage[\"relay\"],\n",
- " x=relay_coverage[\"with_timing\"],\n",
+ " y=relay_coverage_pd[\"relay\"],\n",
+ " x=relay_coverage_pd[\"with_timing\"],\n",
" name=\"With bid timing\",\n",
" orientation=\"h\",\n",
" marker_color=\"#2ecc71\",\n",
- " text=relay_coverage.apply(lambda r: f\"{r['pct_with_timing']:.0f}%\" if r['with_timing'] > 0 else \"\", axis=1),\n",
+ " text=relay_coverage_pd.apply(lambda r: f\"{r['pct_with_timing']:.0f}%\" if r['with_timing'] > 0 else \"\", axis=1),\n",
" textposition=\"inside\",\n",
" hovertemplate=\"%{y}
With timing: %{x:,}
({count:,} blocks)\", font_size=9, yshift=8)\n",
" \n",
" # Add axis titles and ensure ticks are visible on all panels\n",
@@ -773,7 +788,6 @@
},
{
"cell_type": "markdown",
- "id": "30",
"metadata": {},
"source": [
"### Block propagation by relay\n",
@@ -784,47 +798,56 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "31",
"metadata": {},
"outputs": [],
"source": [
"# Prepare relay data for heatmap\n",
- "df_relay_heat = df_mev_only.dropna(subset=[\"block_first_seen_ms\"]).copy()\n",
- "df_relay_heat[\"relay\"] = df_relay_heat[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n",
- "df_relay_heat = df_relay_heat.dropna(subset=[\"relay\"])\n",
+ "df_relay_heat = df_mev_only.filter(\n",
+ " pl.col(\"block_first_seen_ms\").is_not_null()\n",
+ ").with_columns(\n",
+ " pl.col(\"winning_relays\").list.get(0).alias(\"relay\")\n",
+ ").filter(pl.col(\"relay\").is_not_null())\n",
"\n",
"# Get top relays by block count, sorted descending\n",
- "relay_counts = df_relay_heat[\"relay\"].value_counts()\n",
- "top_relays = relay_counts.head(9).index.tolist()\n",
+ "relay_counts = df_relay_heat.group_by(\"relay\").agg(\n",
+ " pl.count().alias(\"count\")\n",
+ ").sort(\"count\", descending=True)\n",
+ "\n",
+ "top_relays = relay_counts.head(9).select(\"relay\").to_series().to_list()\n",
"relay_order = top_relays # Already sorted by count descending\n",
- "df_relay_top = df_relay_heat[df_relay_heat[\"relay\"].isin(top_relays)].copy()\n",
+ "df_relay_top = df_relay_heat.filter(pl.col(\"relay\").is_in(top_relays))\n",
"\n",
"# Print relay stats\n",
"print(\"Top relays by block count:\")\n",
+ "relay_counts_dict = dict(zip(\n",
+ " relay_counts.select(\"relay\").to_series().to_list(),\n",
+ " relay_counts.select(\"count\").to_series().to_list()\n",
+ "))\n",
"for relay in relay_order:\n",
- " count = relay_counts[relay]\n",
- " median_ms = df_relay_top[df_relay_top[\"relay\"] == relay][\"block_first_seen_ms\"].median()\n",
+ " count = relay_counts_dict[relay]\n",
+ " median_ms = df_relay_top.filter(pl.col(\"relay\") == relay).select(pl.col(\"block_first_seen_ms\").median()).item()\n",
" print(f\" {relay}: {count:,} blocks, median {median_ms:.0f}ms\")"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "32",
"metadata": {},
"outputs": [],
"source": [
"if len(df_relay_top) > 0:\n",
+ " df_relay_top_pd = df_relay_top.to_pandas()\n",
+ " \n",
" n_relays = len(top_relays)\n",
" n_cols = 3\n",
" n_rows = (n_relays + n_cols - 1) // n_cols\n",
" \n",
" # Create 100ms bins for x-axis (block timing)\n",
- " x_max = df_relay_top[\"block_first_seen_ms\"].quantile(0.99) # Trim outliers\n",
+ " x_max = df_relay_top.select(pl.col(\"block_first_seen_ms\").quantile(0.99)).item()\n",
" x_bins = int(x_max // 100) + 1\n",
" \n",
" fig = px.density_heatmap(\n",
- " df_relay_top,\n",
+ " df_relay_top_pd,\n",
" x=\"block_first_seen_ms\",\n",
" y=\"blob_count\",\n",
" facet_col=\"relay\",\n",
@@ -847,7 +870,7 @@
" # Clean up facet titles - add block count\n",
" for ann in fig.layout.annotations:\n",
" relay = ann.text.replace(\"relay=\", \"\")\n",
- " count = relay_counts.get(relay, 0)\n",
+ " count = relay_counts_dict.get(relay, 0)\n",
" ann.update(text=f\"{relay}
({count:,} blocks)\", font_size=9, yshift=8)\n",
" \n",
" # Add axis titles and ensure ticks are visible on all panels\n",
@@ -873,7 +896,6 @@
},
{
"cell_type": "markdown",
- "id": "33",
"metadata": {},
"source": [
"## Bid timing density\n",
@@ -884,19 +906,24 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "34",
"metadata": {},
"outputs": [],
"source": [
"# Density contour with outlier markers\n",
- "df_timing = df_mev_only.dropna(subset=[\"winning_bid_ms\", \"bid_to_block_ms\"]).copy()\n",
- "df_timing = df_timing[(df_timing[\"bid_to_block_ms\"] > 0) & (df_timing[\"bid_to_block_ms\"] < 5000)]\n",
+ "df_timing = df_mev_only.filter(\n",
+ " pl.col(\"winning_bid_ms\").is_not_null() &\n",
+ " pl.col(\"bid_to_block_ms\").is_not_null() &\n",
+ " (pl.col(\"bid_to_block_ms\") > 0) &\n",
+ " (pl.col(\"bid_to_block_ms\") < 5000)\n",
+ ")\n",
"\n",
"if len(df_timing) > 0:\n",
+ " df_timing_pd = df_timing.to_pandas()\n",
+ " \n",
" fig = go.Figure()\n",
" \n",
" # Density contour base\n",
- " contour = px.density_contour(df_timing, x=\"winning_bid_ms\", y=\"bid_to_block_ms\")\n",
+ " contour = px.density_contour(df_timing_pd, x=\"winning_bid_ms\", y=\"bid_to_block_ms\")\n",
" for trace in contour.data:\n",
" trace.update(\n",
" contours_coloring=\"fill\", \n",
@@ -908,9 +935,11 @@
" fig.add_trace(trace)\n",
" \n",
" # Outliers (P95+ on either axis)\n",
- " q95_x = df_timing[\"winning_bid_ms\"].quantile(0.95)\n",
- " q95_y = df_timing[\"bid_to_block_ms\"].quantile(0.95)\n",
- " outliers = df_timing[(df_timing[\"winning_bid_ms\"] > q95_x) | (df_timing[\"bid_to_block_ms\"] > q95_y)]\n",
+ " q95_x = df_timing.select(pl.col(\"winning_bid_ms\").quantile(0.95)).item()\n",
+ " q95_y = df_timing.select(pl.col(\"bid_to_block_ms\").quantile(0.95)).item()\n",
+ " outliers = df_timing.filter(\n",
+ " (pl.col(\"winning_bid_ms\") > q95_x) | (pl.col(\"bid_to_block_ms\") > q95_y)\n",
+ " ).to_pandas()\n",
" \n",
" fig.add_trace(go.Scatter(\n",
" x=outliers[\"winning_bid_ms\"],\n",
@@ -938,7 +967,6 @@
},
{
"cell_type": "markdown",
- "id": "35",
"metadata": {},
"source": [
"## Bid timing by blob count\n",
@@ -949,23 +977,28 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "36",
"metadata": {},
"outputs": [],
"source": [
"# Faceted heatmap by blob count (same bins as other charts)\n",
- "df_timing = df_mev_only.dropna(subset=[\"winning_bid_ms\", \"bid_to_block_ms\"]).copy()\n",
- "df_timing = df_timing[(df_timing[\"bid_to_block_ms\"] > 0) & (df_timing[\"bid_to_block_ms\"] < 5000)]\n",
+ "df_timing = df_mev_only.filter(\n",
+ " pl.col(\"winning_bid_ms\").is_not_null() &\n",
+ " pl.col(\"bid_to_block_ms\").is_not_null() &\n",
+ " (pl.col(\"bid_to_block_ms\") > 0) &\n",
+ " (pl.col(\"bid_to_block_ms\") < 5000)\n",
+ ")\n",
"\n",
"if len(df_timing) > 0:\n",
+ " df_timing_pd = df_timing.to_pandas()\n",
+ " \n",
" n_rows = (len(BLOB_BIN_ORDER) + 2) // 3\n",
" \n",
" # Calculate axis ranges\n",
- " x_max = df_timing[\"winning_bid_ms\"].quantile(0.99)\n",
- " y_max = df_timing[\"bid_to_block_ms\"].quantile(0.99)\n",
+ " x_max = df_timing.select(pl.col(\"winning_bid_ms\").quantile(0.99)).item()\n",
+ " y_max = df_timing.select(pl.col(\"bid_to_block_ms\").quantile(0.99)).item()\n",
"\n",
" fig = px.density_heatmap(\n",
- " df_timing,\n",
+ " df_timing_pd,\n",
" x=\"winning_bid_ms\",\n",
" y=\"bid_to_block_ms\",\n",
" facet_col=\"blob_bin\",\n",
@@ -1014,15 +1047,15 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
- "version": "3.11.0"
+ "version": "3.12.0"
}
},
"nbformat": 4,
- "nbformat_minor": 5
+ "nbformat_minor": 4
}
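The relay-coverage rewrite in notebook 05 is the standard group_by().agg() shape: named aggregations via .alias(), derived columns added with with_columns, and scalars pulled out of a one-cell selection with .item(). A minimal sketch with assumed relay names and counts:

    import polars as pl

    toy = pl.DataFrame({
        "relay": ["a", "a", "b"],
        "slot": [1, 2, 3],
        "has_bid_timing": [True, False, True],
    })

    coverage = toy.group_by("relay").agg(
        pl.col("slot").count().alias("total"),
        pl.col("has_bid_timing").sum().alias("with_timing"),
    ).with_columns(
        (pl.col("with_timing") / pl.col("total") * 100).round(1).alias("pct_with_timing")
    ).sort("total")

    # scalar extraction, as used for the summary prints
    total_mev = coverage.select(pl.col("total").sum()).item()
    print(coverage, total_mev)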
diff --git a/notebooks/06-block-column-timing.ipynb b/notebooks/06-block-column-timing.ipynb
index f3f4749..f20c9a7 100644
--- a/notebooks/06-block-column-timing.ipynb
+++ b/notebooks/06-block-column-timing.ipynb
@@ -19,7 +19,8 @@
},
"outputs": [],
"source": [
- "import pandas as pd\n",
+ "import polars as pl\n",
+ "import pandas as pd # Required for plotly\n",
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
@@ -50,24 +51,37 @@
"metadata": {},
"outputs": [],
"source": [
- "df = load_parquet(\"block_production_timeline\", target_date)\n",
+ "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n",
"\n",
"# Flag MEV vs local blocks\n",
- "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n",
- "df[\"block_type\"] = df[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n",
+ "df = df.with_columns(\n",
+ " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\"),\n",
+ ")\n",
+ "df = df.with_columns(\n",
+ " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\"),\n",
+ ")\n",
"\n",
"# Filter to slots with blobs\n",
- "df[\"has_blobs\"] = df[\"blob_count\"] > 0\n",
- "df_blobs = df[df[\"has_blobs\"]].copy()\n",
+ "df = df.with_columns(\n",
+ " (pl.col(\"blob_count\") > 0).alias(\"has_blobs\"),\n",
+ ")\n",
+ "df_blobs = df.filter(pl.col(\"has_blobs\"))\n",
"\n",
"# Calculate block to first column delay\n",
- "df_blobs = df_blobs.dropna(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n",
- "df_blobs[\"block_to_column_ms\"] = df_blobs[\"first_column_first_seen_ms\"] - df_blobs[\"block_first_seen_ms\"]\n",
+ "df_blobs = df_blobs.drop_nulls(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n",
+ "df_blobs = df_blobs.with_columns(\n",
+ " (pl.col(\"first_column_first_seen_ms\") - pl.col(\"block_first_seen_ms\")).alias(\"block_to_column_ms\"),\n",
+ ")\n",
"\n",
- "print(f\"Total slots: {len(df):,}\")\n",
- "print(f\"Slots with blobs: {len(df_blobs):,} ({len(df_blobs)/len(df)*100:.1f}%)\")\n",
- "print(f\" MEV: {df_blobs['has_mev'].sum():,} ({df_blobs['has_mev'].mean()*100:.1f}%)\")\n",
- "print(f\" Local: {(~df_blobs['has_mev']).sum():,} ({(~df_blobs['has_mev']).mean()*100:.1f}%)\")"
+ "total_slots = len(df)\n",
+ "blob_slots = len(df_blobs)\n",
+ "mev_count = df_blobs.filter(pl.col(\"has_mev\")).height\n",
+ "local_count = df_blobs.filter(~pl.col(\"has_mev\")).height\n",
+ "\n",
+ "print(f\"Total slots: {total_slots:,}\")\n",
+ "print(f\"Slots with blobs: {blob_slots:,} ({blob_slots/total_slots*100:.1f}%)\")\n",
+ "print(f\" MEV: {mev_count:,} ({mev_count/blob_slots*100:.1f}%)\")\n",
+ "print(f\" Local: {local_count:,} ({local_count/blob_slots*100:.1f}%)\")"
]
},
{
@@ -91,7 +105,7 @@
"source": [
"if len(df_blobs) > 0:\n",
" fig = px.histogram(\n",
- " df_blobs,\n",
+ " df_blobs.to_pandas(),\n",
" x=\"block_to_column_ms\",\n",
" color=\"block_type\",\n",
" category_orders={\"block_type\": [\"MEV\", \"Local\"]},\n",
@@ -121,13 +135,19 @@
"source": [
"# Summary statistics\n",
"if len(df_blobs) > 0:\n",
- " stats = df_blobs[\"block_to_column_ms\"].describe(percentiles=[0.5, 0.9, 0.95, 0.99])\n",
+ " block_to_column = df_blobs[\"block_to_column_ms\"]\n",
+ " median = block_to_column.median()\n",
+ " p90 = block_to_column.quantile(0.9)\n",
+ " p95 = block_to_column.quantile(0.95)\n",
+ " p99 = block_to_column.quantile(0.99)\n",
+ " max_val = block_to_column.max()\n",
+ " \n",
" print(\"Block to first column (ms):\")\n",
- " print(f\" Median: {stats['50%']:.0f}\")\n",
- " print(f\" P90: {stats['90%']:.0f}\")\n",
- " print(f\" P95: {stats['95%']:.0f}\")\n",
- " print(f\" P99: {stats['99%']:.0f}\")\n",
- " print(f\" Max: {stats['max']:.0f}\")"
+ " print(f\" Median: {median:.0f}\")\n",
+ " print(f\" P90: {p90:.0f}\")\n",
+ " print(f\" P95: {p95:.0f}\")\n",
+ " print(f\" P99: {p99:.0f}\")\n",
+ " print(f\" Max: {max_val:.0f}\")"
]
},
{
@@ -148,12 +168,13 @@
"outputs": [],
"source": [
"if len(df_blobs) > 0:\n",
- " df_plot = df_blobs.copy()\n",
- " df_plot[\"blob_count_f\"] = df_plot[\"blob_count\"].astype(float) # Force continuous color\n",
+ " df_plot = df_blobs.with_columns(\n",
+ " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\"),\n",
+ " )\n",
" max_blobs = df_plot[\"blob_count\"].max()\n",
" \n",
" fig = px.scatter(\n",
- " df_plot,\n",
+ " df_plot.to_pandas(),\n",
" x=\"slot_start_date_time\",\n",
" y=\"block_to_column_ms\",\n",
" color=\"blob_count_f\",\n",
@@ -192,10 +213,10 @@
"outputs": [],
"source": [
"# Filter to slots with blobs (column_spread only exists for blob slots)\n",
- "df_col_spread = df[df[\"blob_count\"] > 0].dropna(subset=[\"column_spread_ms\"])\n",
+ "df_col_spread = df.filter(pl.col(\"blob_count\") > 0).drop_nulls(subset=[\"column_spread_ms\"])\n",
"if len(df_col_spread) > 0:\n",
" fig = px.box(\n",
- " df_col_spread,\n",
+ " df_col_spread.to_pandas(),\n",
" x=\"blob_count\",\n",
" y=\"column_spread_ms\",\n",
" color=\"block_type\",\n",
@@ -230,11 +251,13 @@
"metadata": {},
"outputs": [],
"source": [
- "df_delay = df[df[\"blob_count\"] > 0].dropna(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n",
- "df_delay[\"block_to_column_ms\"] = df_delay[\"first_column_first_seen_ms\"] - df_delay[\"block_first_seen_ms\"]\n",
+ "df_delay = df.filter(pl.col(\"blob_count\") > 0).drop_nulls(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n",
+ "df_delay = df_delay.with_columns(\n",
+ " (pl.col(\"first_column_first_seen_ms\") - pl.col(\"block_first_seen_ms\")).alias(\"block_to_column_ms\"),\n",
+ ")\n",
"if len(df_delay) > 0:\n",
" fig = px.box(\n",
- " df_delay,\n",
+ " df_delay.to_pandas(),\n",
" x=\"blob_count\",\n",
" y=\"block_to_column_ms\",\n",
" color=\"block_type\",\n",
diff --git a/notebooks/07-propagation-anomalies.ipynb b/notebooks/07-propagation-anomalies.ipynb
index 243e59e..557de69 100644
--- a/notebooks/07-propagation-anomalies.ipynb
+++ b/notebooks/07-propagation-anomalies.ipynb
@@ -20,6 +20,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "import polars as pl\n",
"import numpy as np\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
@@ -52,15 +53,19 @@
"metadata": {},
"outputs": [],
"source": [
- "df = load_parquet(\"block_production_timeline\", target_date)\n",
+ "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n",
"\n",
"# Filter to valid blocks (exclude missed slots)\n",
- "df = df[df[\"block_first_seen_ms\"].notna()]\n",
- "df = df[(df[\"block_first_seen_ms\"] >= 0) & (df[\"block_first_seen_ms\"] < 60000)]\n",
+ "df = df.filter(pl.col(\"block_first_seen_ms\").is_not_null())\n",
+ "df = df.filter((pl.col(\"block_first_seen_ms\") >= 0) & (pl.col(\"block_first_seen_ms\") < 60000))\n",
"\n",
"# Flag MEV vs local blocks\n",
- "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n",
- "df[\"block_type\"] = df[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n",
+ "df = df.with_columns([\n",
+ " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\"),\n",
+ "])\n",
+ "df = df.with_columns([\n",
+ " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\"),\n",
+ "])\n",
"\n",
"# Get max blob count for charts\n",
"max_blobs = df[\"blob_count\"].max()\n",
@@ -95,33 +100,43 @@
"outputs": [],
"source": [
"# Conditional outliers: blocks slow relative to their blob count\n",
- "df_anomaly = df.copy()\n",
+ "df_anomaly = df.clone()\n",
"\n",
- "# Fit regression: block_first_seen_ms ~ blob_count\n",
- "slope, intercept, r_value, p_value, std_err = stats.linregress(\n",
- " df_anomaly[\"blob_count\"].astype(float), df_anomaly[\"block_first_seen_ms\"]\n",
- ")\n",
+ "# Fit regression: block_first_seen_ms ~ blob_count (need numpy arrays)\n",
+ "blob_count_arr = df_anomaly[\"blob_count\"].cast(pl.Float64).to_numpy()\n",
+ "block_ms_arr = df_anomaly[\"block_first_seen_ms\"].to_numpy()\n",
+ "\n",
+ "slope, intercept, r_value, p_value, std_err = stats.linregress(blob_count_arr, block_ms_arr)\n",
"\n",
"# Calculate expected value and residual\n",
- "df_anomaly[\"expected_ms\"] = intercept + slope * df_anomaly[\"blob_count\"].astype(float)\n",
- "df_anomaly[\"residual_ms\"] = df_anomaly[\"block_first_seen_ms\"] - df_anomaly[\"expected_ms\"]\n",
+ "df_anomaly = df_anomaly.with_columns([\n",
+ " (pl.lit(intercept) + pl.lit(slope) * pl.col(\"blob_count\").cast(pl.Float64)).alias(\"expected_ms\"),\n",
+ "])\n",
+ "df_anomaly = df_anomaly.with_columns([\n",
+ " (pl.col(\"block_first_seen_ms\") - pl.col(\"expected_ms\")).alias(\"residual_ms\"),\n",
+ "])\n",
"\n",
"# Calculate residual standard deviation\n",
"residual_std = df_anomaly[\"residual_ms\"].std()\n",
"\n",
"# Flag anomalies: residual > 2σ (unexpectedly slow)\n",
- "df_anomaly[\"is_anomaly\"] = df_anomaly[\"residual_ms\"] > 2 * residual_std\n",
+ "df_anomaly = df_anomaly.with_columns([\n",
+ " (pl.col(\"residual_ms\") > 2 * residual_std).alias(\"is_anomaly\"),\n",
+ "])\n",
"\n",
"n_anomalies = df_anomaly[\"is_anomaly\"].sum()\n",
"pct_anomalies = n_anomalies / len(df_anomaly) * 100\n",
"\n",
"# Prepare outliers dataframe\n",
- "df_outliers = df_anomaly[df_anomaly[\"is_anomaly\"]].copy()\n",
- "df_outliers[\"relay\"] = df_outliers[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else \"Local\")\n",
- "df_outliers[\"proposer\"] = df_outliers[\"proposer_entity\"].fillna(\"Unknown\")\n",
- "df_outliers[\"builder\"] = df_outliers[\"winning_builder\"].apply(\n",
- " lambda x: f\"{x[:10]}...\" if pd.notna(x) and x else \"Local\"\n",
- ")\n",
+ "df_outliers = df_anomaly.filter(pl.col(\"is_anomaly\"))\n",
+ "df_outliers = df_outliers.with_columns([\n",
+ " pl.col(\"winning_relays\").list.get(0).fill_null(\"Local\").alias(\"relay\"),\n",
+ " pl.col(\"proposer_entity\").fill_null(\"Unknown\").alias(\"proposer\"),\n",
+ " pl.when(pl.col(\"winning_builder\").is_not_null() & (pl.col(\"winning_builder\") != \"\"))\n",
+ " .then(pl.col(\"winning_builder\").str.slice(0, 10) + pl.lit(\"...\"))\n",
+ " .otherwise(pl.lit(\"Local\"))\n",
+ " .alias(\"builder\"),\n",
+ "])\n",
"\n",
"print(f\"Regression: block_ms = {intercept:.1f} + {slope:.2f} × blob_count (R² = {r_value**2:.3f})\")\n",
"print(f\"Residual σ = {residual_std:.1f}ms\")\n",
@@ -164,13 +179,17 @@
"))\n",
"\n",
"# Normal points (sample to avoid overplotting)\n",
- "df_normal = df_anomaly[~df_anomaly[\"is_anomaly\"]]\n",
+ "df_normal = df_anomaly.filter(~pl.col(\"is_anomaly\"))\n",
"if len(df_normal) > 2000:\n",
- " df_normal = df_normal.sample(2000, random_state=42)\n",
+ " df_normal = df_normal.sample(n=2000, seed=42)\n",
+ "\n",
+ "# Convert to pandas for plotly\n",
+ "df_normal_pd = df_normal.to_pandas()\n",
+ "df_outliers_pd = df_outliers.to_pandas()\n",
"\n",
"fig.add_trace(go.Scatter(\n",
- " x=df_normal[\"blob_count\"],\n",
- " y=df_normal[\"block_first_seen_ms\"],\n",
+ " x=df_normal_pd[\"blob_count\"],\n",
+ " y=df_normal_pd[\"block_first_seen_ms\"],\n",
" mode=\"markers\",\n",
" marker=dict(size=4, color=\"rgba(100,150,200,0.4)\"),\n",
" name=f\"Normal ({len(df_anomaly) - n_anomalies:,})\",\n",
@@ -179,8 +198,8 @@
"\n",
"# Anomaly points\n",
"fig.add_trace(go.Scatter(\n",
- " x=df_outliers[\"blob_count\"],\n",
- " y=df_outliers[\"block_first_seen_ms\"],\n",
+ " x=df_outliers_pd[\"blob_count\"],\n",
+ " y=df_outliers_pd[\"block_first_seen_ms\"],\n",
" mode=\"markers\",\n",
" marker=dict(\n",
" size=7,\n",
@@ -189,9 +208,9 @@
" ),\n",
" name=f\"Anomalies ({n_anomalies:,})\",\n",
" customdata=np.column_stack([\n",
- " df_outliers[\"slot\"],\n",
- " df_outliers[\"residual_ms\"].round(0),\n",
- " df_outliers[\"relay\"],\n",
+ " df_outliers_pd[\"slot\"],\n",
+ " df_outliers_pd[\"residual_ms\"].round(0),\n",
+ " df_outliers_pd[\"relay\"],\n",
" ]),\n",
" hovertemplate=\"Slot %{customdata[0]}
Blobs: %{x}
Actual: %{y:.0f}ms
+%{customdata[1]}ms vs expected
Relay: %{customdata[2]}