diff --git a/notebooks/01-blob-inclusion.ipynb b/notebooks/01-blob-inclusion.ipynb index 68fca04..92201df 100644 --- a/notebooks/01-blob-inclusion.ipynb +++ b/notebooks/01-blob-inclusion.ipynb @@ -20,6 +20,7 @@ "import altair as alt\n", "import numpy as np\n", "import pandas as pd\n", + "import polars as pl\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "\n", @@ -118,13 +119,16 @@ "metadata": {}, "outputs": [], "source": [ - "df_blocks_blob_epoch = load_parquet(\"blocks_blob_epoch\", target_date)\n", + "df_blocks_blob_epoch_pd = load_parquet(\"blocks_blob_epoch\", target_date)\n", + "df_blocks_blob_epoch = pl.from_pandas(df_blocks_blob_epoch_pd)\n", "\n", "# Format blob count as \"XX blobs\" for display (moved from SQL for cleaner queries)\n", - "df_blocks_blob_epoch[\"series\"] = df_blocks_blob_epoch[\"blob_count\"].apply(lambda x: f\"{int(x):02d} blobs\")\n", + "df_blocks_blob_epoch = df_blocks_blob_epoch.with_columns(\n", + " pl.col(\"blob_count\").cast(pl.Int64).map_elements(lambda x: f\"{x:02d} blobs\", return_dtype=pl.Utf8).alias(\"series\")\n", + ")\n", "\n", "chart = (\n", - " alt.Chart(df_blocks_blob_epoch)\n", + " alt.Chart(df_blocks_blob_epoch.to_pandas())\n", " .mark_bar()\n", " .encode(\n", " x=alt.X(\"time:T\"),\n", @@ -177,23 +181,32 @@ "metadata": {}, "outputs": [], "source": [ - "df_blob_popularity = load_parquet(\"blob_popularity\", target_date)\n", + "df_blob_popularity_pd = load_parquet(\"blob_popularity\", target_date)\n", + "df_blob_popularity = pl.from_pandas(df_blob_popularity_pd)\n", "\n", "# Pivot for heatmap\n", - "df_pivot = df_blob_popularity.pivot(index=\"blob_count\", columns=\"time\", values=\"count\").fillna(0)\n", + "df_pivot = df_blob_popularity.pivot(on=\"time\", index=\"blob_count\", values=\"count\").fill_null(0)\n", + "\n", + "# Extract column order (time columns)\n", + "time_cols = [c for c in df_pivot.columns if c != \"blob_count\"]\n", + "blob_counts = df_pivot[\"blob_count\"].to_list()\n", "\n", "# Create epoch lookup for hover data\n", - "epoch_lookup = df_blob_popularity.drop_duplicates(subset=[\"time\"]).set_index(\"time\")[\"epoch\"].to_dict()\n", + "df_epoch_lookup = df_blob_popularity.unique(subset=[\"time\"]).select([\"time\", \"epoch\"])\n", + "epoch_lookup = dict(zip(df_epoch_lookup[\"time\"].to_list(), df_epoch_lookup[\"epoch\"].to_list()))\n", + "\n", + "# Extract z values as numpy array\n", + "z_values = df_pivot.select(time_cols).to_numpy()\n", "\n", "fig = go.Figure(\n", " data=go.Heatmap(\n", - " z=df_pivot.values,\n", - " x=df_pivot.columns,\n", - " y=[str(int(b)) for b in df_pivot.index],\n", + " z=z_values,\n", + " x=time_cols,\n", + " y=[str(int(b)) for b in blob_counts],\n", " colorscale=\"inferno\",\n", " reversescale=False,\n", " colorbar=dict(title=\"Block Count\"),\n", - " customdata=[[epoch_lookup.get(t, \"\") for t in df_pivot.columns] for _ in df_pivot.index],\n", + " customdata=[[epoch_lookup.get(t, \"\") for t in time_cols] for _ in blob_counts],\n", " hovertemplate=\"Epoch Time: %{x}
<br>Epoch: %{customdata}<br>Blob Count: %{y}<br>
Block Count: %{z}\",\n", " ),\n", ")\n", @@ -237,27 +250,36 @@ "metadata": {}, "outputs": [], "source": [ - "df_slot_in_epoch = load_parquet(\"slot_in_epoch\", target_date)\n", + "df_slot_in_epoch_pd = load_parquet(\"slot_in_epoch\", target_date)\n", + "df_slot_in_epoch = pl.from_pandas(df_slot_in_epoch_pd)\n", "\n", - "df_pivot = df_slot_in_epoch.pivot(index=\"slot_in_epoch\", columns=\"time\", values=\"blob_count\").fillna(0)\n", + "# Pivot for heatmap\n", + "df_pivot = df_slot_in_epoch.pivot(on=\"time\", index=\"slot_in_epoch\", values=\"blob_count\").fill_null(0)\n", + "df_slot_pivot = df_slot_in_epoch.pivot(on=\"time\", index=\"slot_in_epoch\", values=\"slot\").fill_null(0)\n", "\n", - "# Create slot lookup for hover data (slot number for each cell)\n", - "df_slot_pivot = df_slot_in_epoch.pivot(index=\"slot_in_epoch\", columns=\"time\", values=\"slot\").fillna(0)\n", + "# Extract column order (time columns) and index values\n", + "time_cols = [c for c in df_pivot.columns if c != \"slot_in_epoch\"]\n", + "slot_in_epoch_vals = df_pivot[\"slot_in_epoch\"].to_list()\n", "\n", "# Create epoch lookup for hover data\n", - "epoch_lookup = df_slot_in_epoch.drop_duplicates(subset=[\"time\"]).set_index(\"time\")[\"epoch\"].to_dict()\n", + "df_epoch_lookup = df_slot_in_epoch.unique(subset=[\"time\"]).select([\"time\", \"epoch\"])\n", + "epoch_lookup = dict(zip(df_epoch_lookup[\"time\"].to_list(), df_epoch_lookup[\"epoch\"].to_list()))\n", + "\n", + "# Extract z and slot values as numpy arrays\n", + "z_values = df_pivot.select(time_cols).to_numpy()\n", + "slot_values = df_slot_pivot.select(time_cols).to_numpy()\n", "\n", "# Build customdata with [slot, epoch] for each cell\n", "customdata = np.dstack([\n", - " df_slot_pivot.values.T,\n", - " [[epoch_lookup.get(t, \"\") for _ in df_pivot.index] for t in df_pivot.columns]\n", + " slot_values.T,\n", + " [[epoch_lookup.get(t, \"\") for _ in slot_in_epoch_vals] for t in time_cols]\n", "])\n", "\n", "fig = go.Figure(\n", " data=go.Heatmap(\n", - " z=df_pivot.values.T,\n", - " x=[str(int(s)) for s in df_pivot.index],\n", - " y=df_pivot.columns,\n", + " z=z_values.T,\n", + " x=[str(int(s)) for s in slot_in_epoch_vals],\n", + " y=time_cols,\n", " colorscale=\"thermal\",\n", " reversescale=True,\n", " colorbar=dict(\n", @@ -298,30 +320,40 @@ "metadata": {}, "outputs": [], "source": [ - "df_pivot = df_slot_in_epoch.pivot(index=\"slot_in_epoch\", columns=\"time\", values=\"blob_count\").fillna(0)\n", - "df_slot_pivot = df_slot_in_epoch.pivot(index=\"slot_in_epoch\", columns=\"time\", values=\"slot\").fillna(0)\n", - "epoch_lookup = df_slot_in_epoch.drop_duplicates(subset=[\"time\"]).set_index(\"time\")[\"epoch\"].to_dict()\n", + "# Pivot data using polars (reusing df_slot_in_epoch from previous cell)\n", + "df_pivot = df_slot_in_epoch.pivot(on=\"time\", index=\"slot_in_epoch\", values=\"blob_count\").fill_null(0)\n", + "df_slot_pivot = df_slot_in_epoch.pivot(on=\"time\", index=\"slot_in_epoch\", values=\"slot\").fill_null(0)\n", + "\n", + "# Extract column order (time columns) and index values\n", + "time_cols = [c for c in df_pivot.columns if c != \"slot_in_epoch\"]\n", + "slot_in_epoch_vals = df_pivot[\"slot_in_epoch\"].to_list()\n", + "\n", + "# Create epoch lookup for hover data\n", + "df_epoch_lookup = df_slot_in_epoch.unique(subset=[\"time\"]).select([\"time\", \"epoch\"])\n", + "epoch_lookup = dict(zip(df_epoch_lookup[\"time\"].to_list(), df_epoch_lookup[\"epoch\"].to_list()))\n", + "\n", + "# Extract values as numpy arrays and transpose for row-based 
processing\n", + "z_matrix = df_pivot.select(time_cols).to_numpy().T # shape: (n_times, n_slots)\n", + "slot_matrix = df_slot_pivot.select(time_cols).to_numpy().T # shape: (n_times, n_slots)\n", "\n", "# Parameters\n", "n_columns = 4\n", - "n_rows = len(df_pivot.columns)\n", + "n_rows = len(time_cols)\n", "rows_per_chunk = n_rows // n_columns\n", + "n_slots = len(slot_in_epoch_vals)\n", "\n", "# Reshape: stack chunks horizontally\n", - "chunks = []\n", + "z_chunks = []\n", "slot_chunks = []\n", "for i in range(n_columns):\n", - " chunk = df_pivot.T.iloc[i*rows_per_chunk:(i+1)*rows_per_chunk, :]\n", - " chunk = chunk.reset_index(drop=True)\n", - " chunks.append(chunk)\n", - " \n", - " slot_chunk = df_slot_pivot.T.iloc[i*rows_per_chunk:(i+1)*rows_per_chunk, :]\n", - " slot_chunk = slot_chunk.reset_index(drop=True)\n", - " slot_chunks.append(slot_chunk)\n", + " start_idx = i * rows_per_chunk\n", + " end_idx = (i + 1) * rows_per_chunk\n", + " z_chunks.append(z_matrix[start_idx:end_idx, :])\n", + " slot_chunks.append(slot_matrix[start_idx:end_idx, :])\n", "\n", "# Concatenate horizontally (side by side)\n", - "df_combined = pd.concat(chunks, axis=1, ignore_index=True)\n", - "df_slot_combined = pd.concat(slot_chunks, axis=1, ignore_index=True)\n", + "df_combined = np.hstack(z_chunks)\n", + "df_slot_combined = np.hstack(slot_chunks)\n", "\n", "# Build epoch array matching combined layout\n", "epoch_combined = []\n", @@ -329,28 +361,22 @@ " epoch_row = []\n", " for chunk_idx in range(n_columns):\n", " time_idx = chunk_idx * rows_per_chunk + row_idx\n", - " if time_idx < len(df_pivot.columns):\n", - " time_val = df_pivot.columns[time_idx]\n", - " epoch_row.extend([epoch_lookup.get(time_val, \"\")] * len(df_pivot.index))\n", + " if time_idx < len(time_cols):\n", + " time_val = time_cols[time_idx]\n", + " epoch_row.extend([epoch_lookup.get(time_val, \"\")] * n_slots)\n", " else:\n", - " epoch_row.extend([\"\"] * len(df_pivot.index))\n", + " epoch_row.extend([\"\"] * n_slots)\n", " epoch_combined.append(epoch_row)\n", "\n", - "customdata = np.dstack([df_slot_combined.values, epoch_combined])\n", + "customdata = np.dstack([df_slot_combined, epoch_combined])\n", "\n", - "# Create x-axis labels with dividers\n", - "n_slots = len(df_pivot.index)\n", - "x_labels = list(range(n_slots)) * n_columns\n", - "\n", - "y_labels = []\n", - "for row_idx in range(rows_per_chunk):\n", - " time_val = df_pivot.columns[row_idx]\n", - " y_labels.append(str(time_val))\n", + "# Create y-axis labels\n", + "y_labels = [str(time_cols[row_idx]) for row_idx in range(rows_per_chunk)]\n", "\n", "fig = go.Figure(\n", " data=go.Heatmap(\n", - " z=df_combined.values,\n", - " x=list(range(len(df_combined.columns))),\n", + " z=df_combined,\n", + " x=list(range(df_combined.shape[1])),\n", " y=y_labels,\n", " colorscale=\"thermal\",\n", " reversescale=True,\n", @@ -381,8 +407,8 @@ " yaxis_title=\"Epoch\",\n", " yaxis=dict(autorange=\"reversed\"),\n", " xaxis=dict(\n", - " tickvals=list(range(len(df_combined.columns))),\n", - " ticktext=[str(i % n_slots) for i in range(len(df_combined.columns))],\n", + " tickvals=list(range(df_combined.shape[1])),\n", + " ticktext=[str(i % n_slots) for i in range(df_combined.shape[1])],\n", " tickangle=90,\n", " tickfont=dict(size=6),\n", " ),\n", diff --git a/notebooks/02-blob-flow.ipynb b/notebooks/02-blob-flow.ipynb index 69522f5..5612b4a 100644 --- a/notebooks/02-blob-flow.ipynb +++ b/notebooks/02-blob-flow.ipynb @@ -18,6 +18,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "import polars as 
pl\n", "import plotly.graph_objects as go\n", "import plotly.colors as pc\n", "\n", @@ -93,15 +94,17 @@ "outputs": [], "source": [ "# Load blob flow data\n", - "df_blob_flow = load_parquet(\"blob_flow\", target_date)\n", + "df_blob_flow = pl.from_pandas(load_parquet(\"blob_flow\", target_date))\n", "\n", "# Fill missing values\n", - "df_blob_flow[\"proposer_entity\"] = df_blob_flow[\"proposer_entity\"].fillna(\"Unknown\")\n", - "df_blob_flow[\"winning_relay\"] = df_blob_flow[\"winning_relay\"].fillna(\"Local/Unknown\")\n", + "df_blob_flow = df_blob_flow.with_columns([\n", + " pl.col(\"proposer_entity\").fill_null(\"Unknown\"),\n", + " pl.col(\"winning_relay\").fill_null(\"Local/Unknown\"),\n", + "])\n", "\n", "print(f\"Total blocks: {len(df_blob_flow)}\")\n", - "print(f\"Unique proposer entities: {df_blob_flow['proposer_entity'].nunique()}\")\n", - "print(f\"Unique relays: {df_blob_flow['winning_relay'].nunique()}\")" + "print(f\"Unique proposer entities: {df_blob_flow['proposer_entity'].n_unique()}\")\n", + "print(f\"Unique relays: {df_blob_flow['winning_relay'].n_unique()}\")" ] }, { @@ -120,24 +123,40 @@ "outputs": [], "source": [ "# Calculate block counts per entity\n", - "entity_block_counts = df_blob_flow.groupby(\"proposer_entity\").size()\n", + "entity_block_counts = df_blob_flow.group_by(\"proposer_entity\").len()\n", "\n", "# Get entities that meet the threshold\n", - "valid_entities = entity_block_counts[entity_block_counts >= MIN_BLOCKS].index\n", + "valid_entities = (\n", + " entity_block_counts\n", + " .filter(pl.col(\"len\") >= MIN_BLOCKS)\n", + " .get_column(\"proposer_entity\")\n", + " .to_list()\n", + ")\n", "\n", "# Filter the dataframe\n", - "df_filtered = df_blob_flow[df_blob_flow[\"proposer_entity\"].isin(valid_entities)]\n", + "df_filtered = df_blob_flow.filter(pl.col(\"proposer_entity\").is_in(valid_entities))\n", "\n", "entity_blob_flow = (\n", - " df_filtered.groupby([\"proposer_entity\", \"blob_count\"])\n", - " .size()\n", - " .reset_index(name=\"block_count\")\n", + " df_filtered\n", + " .group_by([\"proposer_entity\", \"blob_count\"])\n", + " .len()\n", + " .rename({\"len\": \"block_count\"})\n", ")\n", "\n", "# Sort entities by total block count (descending)\n", - "entity_totals = entity_blob_flow.groupby(\"proposer_entity\")[\"block_count\"].sum()\n", - "entities = entity_totals.sort_values(ascending=False).index.tolist()\n", - "blob_counts = sorted(entity_blob_flow[\"blob_count\"].unique(), reverse=True) # Descending\n", + "entity_totals = (\n", + " entity_blob_flow\n", + " .group_by(\"proposer_entity\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "entities = entity_totals.get_column(\"proposer_entity\").to_list()\n", + "entity_totals_dict = dict(zip(\n", + " entity_totals.get_column(\"proposer_entity\").to_list(),\n", + " entity_totals.get_column(\"block_count\").to_list()\n", + "))\n", + "\n", + "blob_counts = sorted(entity_blob_flow.get_column(\"blob_count\").unique().to_list(), reverse=True)\n", "\n", "# Create node labels: entities + blob counts (blob counts sorted descending)\n", "entity_nodes = [f\"E:{e}\" for e in entities]\n", @@ -148,8 +167,11 @@ "node_map = {name: idx for idx, name in enumerate(all_nodes)}\n", "\n", "# Calculate node weights (total flow through each node)\n", - "entity_weights = [entity_totals[e] for e in entities]\n", - "blob_totals = entity_blob_flow.groupby(\"blob_count\")[\"block_count\"].sum()\n", + "entity_weights = [entity_totals_dict[e] for e in entities]\n", 
+ "blob_totals = dict(zip(\n", + " entity_blob_flow.group_by(\"blob_count\").agg(pl.col(\"block_count\").sum()).get_column(\"blob_count\").to_list(),\n", + " entity_blob_flow.group_by(\"blob_count\").agg(pl.col(\"block_count\").sum()).get_column(\"block_count\").to_list()\n", + "))\n", "blob_weights = [blob_totals.get(bc, 0) for bc in blob_counts]\n", "\n", "n_entities = len(entity_nodes)\n", @@ -176,7 +198,7 @@ "values = []\n", "link_labels = []\n", "\n", - "for _, row in entity_blob_flow.iterrows():\n", + "for row in entity_blob_flow.iter_rows(named=True):\n", " e_node = f\"E:{row['proposer_entity']}\"\n", " bc_node = f\"{int(row['blob_count'])} blobs\"\n", " if e_node in node_map and bc_node in node_map:\n", @@ -236,16 +258,26 @@ "outputs": [], "source": [ "relay_blob_flow = (\n", - " df_blob_flow.groupby([\"winning_relay\", \"blob_count\"])\n", - " .size()\n", - " .reset_index(name=\"block_count\")\n", + " df_blob_flow\n", + " .group_by([\"winning_relay\", \"blob_count\"])\n", + " .len()\n", + " .rename({\"len\": \"block_count\"})\n", ")\n", "\n", "# Sort relays by total block count (descending)\n", - "relay_totals = relay_blob_flow.groupby(\"winning_relay\")[\"block_count\"].sum()\n", - "relays = relay_totals.sort_values(ascending=False).index.tolist()\n", + "relay_totals_df = (\n", + " relay_blob_flow\n", + " .group_by(\"winning_relay\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "relays = relay_totals_df.get_column(\"winning_relay\").to_list()\n", + "relay_totals = dict(zip(\n", + " relay_totals_df.get_column(\"winning_relay\").to_list(),\n", + " relay_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "\n", - "blob_counts = sorted(relay_blob_flow[\"blob_count\"].unique(), reverse=True) # Descending\n", + "blob_counts = sorted(relay_blob_flow.get_column(\"blob_count\").unique().to_list(), reverse=True)\n", "\n", "# Create node labels: relays + blob counts (blob counts sorted descending)\n", "relay_nodes = [f\"R:{r}\" for r in relays]\n", @@ -257,7 +289,11 @@ "\n", "# Calculate node weights\n", "relay_weights = [relay_totals[r] for r in relays]\n", - "blob_totals = relay_blob_flow.groupby(\"blob_count\")[\"block_count\"].sum()\n", + "blob_totals_df = relay_blob_flow.group_by(\"blob_count\").agg(pl.col(\"block_count\").sum())\n", + "blob_totals = dict(zip(\n", + " blob_totals_df.get_column(\"blob_count\").to_list(),\n", + " blob_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "blob_weights = [blob_totals.get(bc, 0) for bc in blob_counts]\n", "\n", "n_relays = len(relay_nodes)\n", @@ -284,7 +320,7 @@ "values = []\n", "link_labels = []\n", "\n", - "for _, row in relay_blob_flow.iterrows():\n", + "for row in relay_blob_flow.iter_rows(named=True):\n", " r_node = f\"R:{row['winning_relay']}\"\n", " bc_node = f\"{int(row['blob_count'])} blobs\"\n", " if r_node in node_map and bc_node in node_map:\n", @@ -343,25 +379,49 @@ "outputs": [], "source": [ "# Calculate block counts per entity\n", - "entity_block_counts = df_blob_flow.groupby(\"proposer_entity\").size()\n", - "valid_entities = entity_block_counts[entity_block_counts >= MIN_BLOCKS].index\n", + "entity_block_counts = df_blob_flow.group_by(\"proposer_entity\").len()\n", + "valid_entities = (\n", + " entity_block_counts\n", + " .filter(pl.col(\"len\") >= MIN_BLOCKS)\n", + " .get_column(\"proposer_entity\")\n", + " .to_list()\n", + ")\n", "\n", "# Filter the dataframe\n", - "df_filtered = 
df_blob_flow[df_blob_flow[\"proposer_entity\"].isin(valid_entities)]\n", + "df_filtered = df_blob_flow.filter(pl.col(\"proposer_entity\").is_in(valid_entities))\n", "\n", "proposer_relay_flow = (\n", - " df_filtered.groupby([\"proposer_entity\", \"winning_relay\"])\n", - " .size()\n", - " .reset_index(name=\"block_count\")\n", + " df_filtered\n", + " .group_by([\"proposer_entity\", \"winning_relay\"])\n", + " .len()\n", + " .rename({\"len\": \"block_count\"})\n", ")\n", "\n", "# Sort entities by total block count (descending)\n", - "entity_totals = proposer_relay_flow.groupby(\"proposer_entity\")[\"block_count\"].sum()\n", - "entities = entity_totals.sort_values(ascending=False).index.tolist()\n", + "entity_totals_df = (\n", + " proposer_relay_flow\n", + " .group_by(\"proposer_entity\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "entities = entity_totals_df.get_column(\"proposer_entity\").to_list()\n", + "entity_totals = dict(zip(\n", + " entity_totals_df.get_column(\"proposer_entity\").to_list(),\n", + " entity_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "\n", "# Sort relays by total block count (descending)\n", - "relay_totals = proposer_relay_flow.groupby(\"winning_relay\")[\"block_count\"].sum()\n", - "relays = relay_totals.sort_values(ascending=False).index.tolist()\n", + "relay_totals_df = (\n", + " proposer_relay_flow\n", + " .group_by(\"winning_relay\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "relays = relay_totals_df.get_column(\"winning_relay\").to_list()\n", + "relay_totals = dict(zip(\n", + " relay_totals_df.get_column(\"winning_relay\").to_list(),\n", + " relay_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "\n", "# Create node labels: entities + relays\n", "entity_nodes = [f\"E:{e}\" for e in entities]\n", @@ -393,7 +453,7 @@ "values = []\n", "link_labels = []\n", "\n", - "for _, row in proposer_relay_flow.iterrows():\n", + "for row in proposer_relay_flow.iter_rows(named=True):\n", " e_node = f\"E:{row['proposer_entity']}\"\n", " r_node = f\"R:{row['winning_relay']}\"\n", " if e_node in node_map and r_node in node_map:\n", @@ -453,35 +513,60 @@ "outputs": [], "source": [ "# Calculate block counts per entity\n", - "entity_block_counts = df_blob_flow.groupby(\"proposer_entity\").size()\n", - "valid_entities = entity_block_counts[entity_block_counts >= MIN_BLOCKS].index\n", + "entity_block_counts = df_blob_flow.group_by(\"proposer_entity\").len()\n", + "valid_entities = (\n", + " entity_block_counts\n", + " .filter(pl.col(\"len\") >= MIN_BLOCKS)\n", + " .get_column(\"proposer_entity\")\n", + " .to_list()\n", + ")\n", "\n", "# Filter the dataframe\n", - "df_filtered = df_blob_flow[df_blob_flow[\"proposer_entity\"].isin(valid_entities)]\n", + "df_filtered = df_blob_flow.filter(pl.col(\"proposer_entity\").is_in(valid_entities))\n", "\n", "# Aggregate flows: entity -> relay\n", "entity_relay_flow = (\n", - " df_filtered.groupby([\"proposer_entity\", \"winning_relay\"])\n", - " .size()\n", - " .reset_index(name=\"block_count\")\n", + " df_filtered\n", + " .group_by([\"proposer_entity\", \"winning_relay\"])\n", + " .len()\n", + " .rename({\"len\": \"block_count\"})\n", ")\n", "\n", "# Aggregate flows: relay -> blob_count\n", "relay_blob_flow = (\n", - " df_filtered.groupby([\"winning_relay\", \"blob_count\"])\n", - " .size()\n", - " .reset_index(name=\"block_count\")\n", + " df_filtered\n", + " .group_by([\"winning_relay\", 
\"blob_count\"])\n", + " .len()\n", + " .rename({\"len\": \"block_count\"})\n", ")\n", "\n", "# Sort entities by total block count (descending)\n", - "entity_totals = entity_relay_flow.groupby(\"proposer_entity\")[\"block_count\"].sum()\n", - "entities = entity_totals.sort_values(ascending=False).index.tolist()\n", + "entity_totals_df = (\n", + " entity_relay_flow\n", + " .group_by(\"proposer_entity\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "entities = entity_totals_df.get_column(\"proposer_entity\").to_list()\n", + "entity_totals = dict(zip(\n", + " entity_totals_df.get_column(\"proposer_entity\").to_list(),\n", + " entity_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "\n", "# Sort relays by total block count (descending)\n", - "relay_totals = relay_blob_flow.groupby(\"winning_relay\")[\"block_count\"].sum()\n", - "relays = relay_totals.sort_values(ascending=False).index.tolist()\n", + "relay_totals_df = (\n", + " relay_blob_flow\n", + " .group_by(\"winning_relay\")\n", + " .agg(pl.col(\"block_count\").sum())\n", + " .sort(\"block_count\", descending=True)\n", + ")\n", + "relays = relay_totals_df.get_column(\"winning_relay\").to_list()\n", + "relay_totals = dict(zip(\n", + " relay_totals_df.get_column(\"winning_relay\").to_list(),\n", + " relay_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "\n", - "blob_counts = sorted(df_filtered[\"blob_count\"].unique(), reverse=True) # Descending\n", + "blob_counts = sorted(df_filtered.get_column(\"blob_count\").unique().to_list(), reverse=True)\n", "\n", "# Create node labels: entities + relays + blob counts\n", "entity_nodes = [f\"E:{e}\" for e in entities]\n", @@ -499,7 +584,11 @@ "# Calculate node weights\n", "entity_weights = [entity_totals[e] for e in entities]\n", "relay_weights = [relay_totals[r] for r in relays]\n", - "blob_totals = relay_blob_flow.groupby(\"blob_count\")[\"block_count\"].sum()\n", + "blob_totals_df = relay_blob_flow.group_by(\"blob_count\").agg(pl.col(\"block_count\").sum())\n", + "blob_totals = dict(zip(\n", + " blob_totals_df.get_column(\"blob_count\").to_list(),\n", + " blob_totals_df.get_column(\"block_count\").to_list()\n", + "))\n", "blob_weights = [blob_totals.get(bc, 0) for bc in blob_counts]\n", "\n", "entity_colors = [pc.qualitative.Plotly[i % len(pc.qualitative.Plotly)] for i in range(n_entities)]\n", @@ -525,7 +614,7 @@ "link_labels = []\n", "\n", "# Entity -> Relay links\n", - "for _, row in entity_relay_flow.iterrows():\n", + "for row in entity_relay_flow.iter_rows(named=True):\n", " e_node = f\"E:{row['proposer_entity']}\"\n", " r_node = f\"R:{row['winning_relay']}\"\n", " if e_node in node_map and r_node in node_map:\n", @@ -535,7 +624,7 @@ " link_labels.append(f\"Entity: {row['proposer_entity']}
<br>Relay: {row['winning_relay']}<br>
Block Count: {row['block_count']}\")\n", "\n", "# Relay -> Blob count links\n", - "for _, row in relay_blob_flow.iterrows():\n", + "for row in relay_blob_flow.iter_rows(named=True):\n", " r_node = f\"R:{row['winning_relay']}\"\n", " bc_node = f\"{int(row['blob_count'])} blobs\"\n", " if r_node in node_map and bc_node in node_map:\n", diff --git a/notebooks/03-column-propagation.ipynb b/notebooks/03-column-propagation.ipynb index 71f35d5..fd1049f 100644 --- a/notebooks/03-column-propagation.ipynb +++ b/notebooks/03-column-propagation.ipynb @@ -20,6 +20,8 @@ "outputs": [], "source": [ "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "\n", @@ -55,7 +57,7 @@ "outputs": [], "source": [ "# Load column propagation data\n", - "df_col_first_seen = load_parquet(\"col_first_seen\", target_date)\n", + "df_col_first_seen = pl.from_pandas(load_parquet(\"col_first_seen\", target_date))\n", "\n", "print(f\"Slots with column data: {len(df_col_first_seen)}\")" ] @@ -81,11 +83,14 @@ "\n", "# Reshape for heatmap: rows = columns (c0-c127), columns = time\n", "col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n", - "df_cols = df_col_first_seen[col_names].T\n", - "df_cols.columns = df_col_first_seen[\"time\"]\n", + "\n", + "# Convert to pandas for reshaping (plotly needs pandas)\n", + "df_pd = df_col_first_seen.select([\"time\", \"slot\"] + col_names).to_pandas()\n", + "df_cols = df_pd[col_names].T\n", + "df_cols.columns = df_pd[\"time\"]\n", "\n", "# Create slot lookup for hover data\n", - "slot_values = df_col_first_seen[\"slot\"].values\n", + "slot_values = df_pd[\"slot\"].values\n", "\n", "# Build customdata: slot number for each column in the heatmap\n", "customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n", @@ -129,19 +134,21 @@ "source": [ "# Compute delta from min value per slot for each column\n", "col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n", - "df_delta = df_col_first_seen.copy()\n", "\n", - "# Calculate row-wise minimum and subtract from each column\n", - "row_mins = df_delta[col_names].min(axis=1)\n", - "for col in col_names:\n", - " df_delta[col] = df_delta[col] - row_mins\n", + "# Calculate row-wise minimum using polars horizontal operations\n", + "df_delta = df_col_first_seen.with_columns(\n", + " pl.min_horizontal(*col_names).alias(\"row_min\")\n", + ").with_columns(\n", + " [(pl.col(col) - pl.col(\"row_min\")).alias(col) for col in col_names]\n", + ").drop(\"row_min\")\n", "\n", - "# Reshape for heatmap\n", - "df_delta_cols = df_delta[col_names].T\n", - "df_delta_cols.columns = df_delta[\"time\"]\n", + "# Convert to pandas for reshaping (plotly needs pandas)\n", + "df_delta_pd = df_delta.select([\"time\", \"slot\"] + col_names).to_pandas()\n", + "df_delta_cols = df_delta_pd[col_names].T\n", + "df_delta_cols.columns = df_delta_pd[\"time\"]\n", "\n", "# Create slot lookup for hover data\n", - "slot_values = df_delta[\"slot\"].values\n", + "slot_values = df_delta_pd[\"slot\"].values\n", "\n", "# Build customdata: slot number for each column in the heatmap\n", "customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n", @@ -188,22 +195,28 @@ "source": [ "# Normalize delta values to 0-1 range per slot\n", "col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n", - "df_normalized = df_col_first_seen.copy()\n", - "\n", - "# Calculate row-wise min and max, then normalize\n", - "row_mins = 
df_normalized[col_names].min(axis=1)\n", - "row_maxs = df_normalized[col_names].max(axis=1)\n", - "row_ranges = row_maxs - row_mins\n", "\n", - "for col in col_names:\n", - " df_normalized[col] = (df_normalized[col] - row_mins) / row_ranges.replace(0, np.nan)\n", + "# Calculate row-wise min, max, and range using polars horizontal operations\n", + "df_normalized = df_col_first_seen.with_columns([\n", + " pl.min_horizontal(*col_names).alias(\"row_min\"),\n", + " pl.max_horizontal(*col_names).alias(\"row_max\"),\n", + "]).with_columns(\n", + " (pl.col(\"row_max\") - pl.col(\"row_min\")).alias(\"row_range\")\n", + ").with_columns([\n", + " pl.when(pl.col(\"row_range\") == 0)\n", + " .then(None)\n", + " .otherwise((pl.col(col) - pl.col(\"row_min\")) / pl.col(\"row_range\"))\n", + " .alias(col)\n", + " for col in col_names\n", + "]).drop([\"row_min\", \"row_max\", \"row_range\"])\n", "\n", - "# Reshape for heatmap\n", - "df_norm_cols = df_normalized[col_names].T\n", - "df_norm_cols.columns = df_normalized[\"time\"]\n", + "# Convert to pandas for reshaping (plotly needs pandas)\n", + "df_norm_pd = df_normalized.select([\"time\", \"slot\"] + col_names).to_pandas()\n", + "df_norm_cols = df_norm_pd[col_names].T\n", + "df_norm_cols.columns = df_norm_pd[\"time\"]\n", "\n", "# Create slot lookup for hover data\n", - "slot_values = df_normalized[\"slot\"].values\n", + "slot_values = df_norm_pd[\"slot\"].values\n", "\n", "# Build customdata: slot number for each column in the heatmap\n", "customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n", @@ -250,11 +263,16 @@ "source": [ "# Compute column spread (max - min across all columns per slot)\n", "col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n", - "df_spread = df_col_first_seen.copy()\n", - "df_spread[\"column_spread_ms\"] = df_spread[col_names].max(axis=1) - df_spread[col_names].min(axis=1)\n", + "\n", + "df_spread = df_col_first_seen.with_columns(\n", + " (pl.max_horizontal(*col_names) - pl.min_horizontal(*col_names)).alias(\"column_spread_ms\")\n", + ")\n", + "\n", + "# Convert to pandas for plotly\n", + "df_spread_pd = df_spread.select([\"time\", \"slot\", \"column_spread_ms\"]).to_pandas()\n", "\n", "fig = px.histogram(\n", - " df_spread,\n", + " df_spread_pd,\n", " x=\"column_spread_ms\",\n", " nbins=60,\n", " color_discrete_sequence=[\"#EF553B\"],\n", @@ -275,14 +293,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Summary statistics\n", - "stats = df_spread[\"column_spread_ms\"].describe(percentiles=[0.5, 0.9, 0.95, 0.99])\n", + "# Summary statistics using polars\n", + "stats = df_spread.select(\"column_spread_ms\").to_series().describe()\n", + "percentiles = df_spread.select(\n", + " pl.col(\"column_spread_ms\").quantile(0.5).alias(\"p50\"),\n", + " pl.col(\"column_spread_ms\").quantile(0.9).alias(\"p90\"),\n", + " pl.col(\"column_spread_ms\").quantile(0.95).alias(\"p95\"),\n", + " pl.col(\"column_spread_ms\").quantile(0.99).alias(\"p99\"),\n", + " pl.col(\"column_spread_ms\").max().alias(\"max\"),\n", + ").row(0)\n", + "\n", "print(\"Column spread (ms):\")\n", - "print(f\" Median: {stats['50%']:.0f}\")\n", - "print(f\" P90: {stats['90%']:.0f}\")\n", - "print(f\" P95: {stats['95%']:.0f}\")\n", - "print(f\" P99: {stats['99%']:.0f}\")\n", - "print(f\" Max: {stats['max']:.0f}\")" + "print(f\" Median: {percentiles[0]:.0f}\")\n", + "print(f\" P90: {percentiles[1]:.0f}\")\n", + "print(f\" P95: {percentiles[2]:.0f}\")\n", + "print(f\" P99: {percentiles[3]:.0f}\")\n", + "print(f\" Max: 
{percentiles[4]:.0f}\")" ] }, { @@ -303,7 +329,7 @@ "outputs": [], "source": [ "fig = px.scatter(\n", - " df_spread,\n", + " df_spread_pd,\n", " x=\"time\",\n", " y=\"column_spread_ms\",\n", " opacity=0.5,\n", @@ -337,19 +363,28 @@ "# Missing columns heatmap - shows gaps in network coverage\n", "col_names = [f\"c{i}\" for i in range(NUM_COLUMNS)]\n", "\n", - "# Create boolean mask: True (1) where column is missing (NaN)\n", - "df_missing = df_col_first_seen[col_names].isna().astype(int).T\n", - "df_missing.columns = df_col_first_seen[\"time\"]\n", + "# Count missing data using polars\n", + "total_missing = df_col_first_seen.select([\n", + " pl.col(col).is_null().sum() for col in col_names\n", + "]).sum_horizontal().item()\n", + "\n", + "slots_with_missing = df_col_first_seen.select(\n", + " pl.any_horizontal([pl.col(col).is_null() for col in col_names])\n", + ").sum().item()\n", "\n", - "# Count missing data\n", - "total_missing = df_col_first_seen[col_names].isna().sum().sum()\n", - "slots_with_missing = (df_col_first_seen[col_names].isna().any(axis=1)).sum()\n", "print(f\"Total missing column observations: {total_missing:,}\")\n", "print(f\"Slots with at least one missing column: {slots_with_missing:,} ({slots_with_missing/len(df_col_first_seen)*100:.1f}%)\")\n", "\n", "if total_missing > 0:\n", + " # Convert to pandas for the heatmap visualization\n", + " df_pd = df_col_first_seen.select([\"time\", \"slot\"] + col_names).to_pandas()\n", + " \n", + " # Create boolean mask: True (1) where column is missing (NaN)\n", + " df_missing = df_pd[col_names].isna().astype(int).T\n", + " df_missing.columns = df_pd[\"time\"]\n", + " \n", " # Create slot lookup for hover data\n", - " slot_values = df_col_first_seen[\"slot\"].values\n", + " slot_values = df_pd[\"slot\"].values\n", " customdata = np.array([[slot_values[j] for j in range(len(slot_values))] for _ in range(NUM_COLUMNS)])\n", "\n", " fig = go.Figure(\n", diff --git a/notebooks/04-mempool-visibility.ipynb b/notebooks/04-mempool-visibility.ipynb index 7d3dd99..240a6ed 100644 --- a/notebooks/04-mempool-visibility.ipynb +++ b/notebooks/04-mempool-visibility.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "0", "metadata": {}, "source": [ "Analysis of transaction visibility in the public mempool before block inclusion on Ethereum mainnet.\n", @@ -12,6 +13,7 @@ { "cell_type": "code", "execution_count": null, + "id": "1", "metadata": { "tags": [ "parameters" @@ -20,6 +22,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "import polars as pl\n", "import numpy as np\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", @@ -57,6 +60,7 @@ { "cell_type": "code", "execution_count": null, + "id": "2", "metadata": { "tags": [ "sql-fold" @@ -70,22 +74,30 @@ { "cell_type": "code", "execution_count": null, + "id": "3", "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"mempool_availability\", target_date)\n", - "df[\"tx_type_label\"] = df[\"tx_type\"].map(TX_TYPE_LABELS)\n", - "df[\"coverage_pct\"] = df[\"seen_before_slot\"] / df[\"total_txs\"] * 100\n", + "df = pl.from_pandas(load_parquet(\"mempool_availability\", target_date))\n", "\n", - "# Calculate never seen (truly private)\n", - "df[\"never_seen\"] = df[\"total_txs\"] - df[\"seen_before_slot\"] - df[\"seen_after_slot\"]\n", + "df = df.with_columns(\n", + " pl.col(\"tx_type\").replace_strict(TX_TYPE_LABELS, default=None).alias(\"tx_type_label\"),\n", + " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 
100).alias(\"coverage_pct\"),\n", + " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n", + ")\n", "\n", "# Extract p50 age from percentiles array (index 0)\n", - "df[\"p50_age_ms\"] = df[\"age_percentiles_ms\"].apply(lambda x: x[0] if x is not None and len(x) > 0 else np.nan)\n", - "df[\"p50_age_s\"] = df[\"p50_age_ms\"] / 1000\n", + "df = df.with_columns(\n", + " pl.col(\"age_percentiles_ms\").list.get(0).alias(\"p50_age_ms\"),\n", + ")\n", + "df = df.with_columns(\n", + " (pl.col(\"p50_age_ms\") / 1000).alias(\"p50_age_s\"),\n", + ")\n", "\n", "# Add hour column for time-series aggregation\n", - "df[\"hour\"] = df[\"slot_start_date_time\"].dt.floor(\"h\")\n", + "df = df.with_columns(\n", + " pl.col(\"slot_start_date_time\").dt.truncate(\"1h\").alias(\"hour\"),\n", + ")\n", "\n", "total = df[\"total_txs\"].sum()\n", "before = df[\"seen_before_slot\"].sum()\n", @@ -93,7 +105,7 @@ "never = total - before - after\n", "\n", "print(f\"Loaded {len(df):,} slot/type rows\")\n", - "print(f\"Slots: {df['slot'].nunique():,}\")\n", + "print(f\"Slots: {df['slot'].n_unique():,}\")\n", "print(f\"Total transactions: {total:,}\")\n", "print(f\" Seen before slot: {before:,} ({100*before/total:.1f}%)\")\n", "print(f\" Seen after slot: {after:,} ({100*after/total:.1f}%)\")\n", @@ -102,6 +114,7 @@ }, { "cell_type": "markdown", + "id": "4", "metadata": {}, "source": [ "## Coverage by transaction type\n", @@ -112,59 +125,70 @@ { "cell_type": "code", "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ "# Aggregate by type\n", - "df_summary = df.groupby([\"tx_type\", \"tx_type_label\"]).agg({\n", - " \"total_txs\": \"sum\",\n", - " \"seen_before_slot\": \"sum\",\n", - " \"seen_after_slot\": \"sum\",\n", - "}).reset_index()\n", - "df_summary[\"never_seen\"] = df_summary[\"total_txs\"] - df_summary[\"seen_before_slot\"] - df_summary[\"seen_after_slot\"]\n", - "df_summary[\"before_pct\"] = df_summary[\"seen_before_slot\"] / df_summary[\"total_txs\"] * 100\n", - "df_summary[\"after_pct\"] = df_summary[\"seen_after_slot\"] / df_summary[\"total_txs\"] * 100\n", - "df_summary[\"never_pct\"] = df_summary[\"never_seen\"] / df_summary[\"total_txs\"] * 100\n", + "df_summary = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n", + " pl.col(\"total_txs\").sum(),\n", + " pl.col(\"seen_before_slot\").sum(),\n", + " pl.col(\"seen_after_slot\").sum(),\n", + ")\n", + "df_summary = df_summary.with_columns(\n", + " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n", + " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 100).alias(\"before_pct\"),\n", + " (pl.col(\"seen_after_slot\") / pl.col(\"total_txs\") * 100).alias(\"after_pct\"),\n", + ")\n", + "df_summary = df_summary.with_columns(\n", + " (pl.col(\"never_seen\") / pl.col(\"total_txs\") * 100).alias(\"never_pct\"),\n", + ")\n", "\n", "# Display summary table\n", - "summary_display = df_summary[[\"tx_type_label\", \"total_txs\", \"before_pct\", \"after_pct\", \"never_pct\"]].copy()\n", - "summary_display.columns = [\"Type\", \"Total\", \"Before slot %\", \"After slot %\", \"Never seen %\"]\n", - "for col in summary_display.columns[2:]:\n", - " summary_display[col] = summary_display[col].round(1)\n", - "summary_display" + "summary_display = df_summary.select(\n", + " pl.col(\"tx_type_label\").alias(\"Type\"),\n", + " pl.col(\"total_txs\").alias(\"Total\"),\n", + " pl.col(\"before_pct\").round(1).alias(\"Before slot 
%\"),\n", + " pl.col(\"after_pct\").round(1).alias(\"After slot %\"),\n", + " pl.col(\"never_pct\").round(1).alias(\"Never seen %\"),\n", + ")\n", + "summary_display.to_pandas()" ] }, { "cell_type": "code", "execution_count": null, + "id": "6", "metadata": {}, "outputs": [], "source": [ "# Coverage stacked bar chart showing before/after/never breakdown\n", + "df_summary_pd = df_summary.to_pandas()\n", + "\n", "fig = go.Figure()\n", "\n", "fig.add_trace(go.Bar(\n", - " x=df_summary[\"tx_type_label\"],\n", - " y=df_summary[\"before_pct\"],\n", + " x=df_summary_pd[\"tx_type_label\"],\n", + " y=df_summary_pd[\"before_pct\"],\n", " name=\"Before slot (public)\",\n", " marker_color=\"#27ae60\",\n", - " text=df_summary[\"before_pct\"].round(1),\n", + " text=df_summary_pd[\"before_pct\"].round(1),\n", " textposition=\"inside\",\n", "))\n", "fig.add_trace(go.Bar(\n", - " x=df_summary[\"tx_type_label\"],\n", - " y=df_summary[\"after_pct\"],\n", + " x=df_summary_pd[\"tx_type_label\"],\n", + " y=df_summary_pd[\"after_pct\"],\n", " name=\"After slot (propagated)\",\n", " marker_color=\"#3498db\",\n", - " text=df_summary[\"after_pct\"].round(1),\n", + " text=df_summary_pd[\"after_pct\"].round(1),\n", " textposition=\"inside\",\n", "))\n", "fig.add_trace(go.Bar(\n", - " x=df_summary[\"tx_type_label\"],\n", - " y=df_summary[\"never_pct\"],\n", + " x=df_summary_pd[\"tx_type_label\"],\n", + " y=df_summary_pd[\"never_pct\"],\n", " name=\"Never seen (private)\",\n", " marker_color=\"#95a5a6\",\n", - " text=df_summary[\"never_pct\"].round(1),\n", + " text=df_summary_pd[\"never_pct\"].round(1),\n", " textposition=\"inside\",\n", "))\n", "\n", @@ -182,6 +206,7 @@ }, { "cell_type": "markdown", + "id": "7", "metadata": {}, "source": [ "## Hourly coverage trends\n", @@ -192,19 +217,24 @@ { "cell_type": "code", "execution_count": null, + "id": "8", "metadata": {}, "outputs": [], "source": [ "# Aggregate to hourly for time-series\n", - "df_hourly = df.groupby([\"hour\", \"tx_type\", \"tx_type_label\"]).agg({\n", - " \"total_txs\": \"sum\",\n", - " \"seen_before_slot\": \"sum\",\n", - " \"seen_after_slot\": \"sum\",\n", - "}).reset_index()\n", - "df_hourly[\"coverage_pct\"] = df_hourly[\"seen_before_slot\"] / df_hourly[\"total_txs\"] * 100\n", + "df_hourly = df.group_by([\"hour\", \"tx_type\", \"tx_type_label\"]).agg(\n", + " pl.col(\"total_txs\").sum(),\n", + " pl.col(\"seen_before_slot\").sum(),\n", + " pl.col(\"seen_after_slot\").sum(),\n", + ")\n", + "df_hourly = df_hourly.with_columns(\n", + " (pl.col(\"seen_before_slot\") / pl.col(\"total_txs\") * 100).alias(\"coverage_pct\"),\n", + ")\n", + "\n", + "df_hourly_pd = df_hourly.to_pandas()\n", "\n", "fig = px.line(\n", - " df_hourly,\n", + " df_hourly_pd,\n", " x=\"hour\",\n", " y=\"coverage_pct\",\n", " color=\"tx_type_label\",\n", @@ -222,6 +252,7 @@ }, { "cell_type": "markdown", + "id": "9", "metadata": {}, "source": [ "## Transaction volume over time\n", @@ -232,33 +263,38 @@ { "cell_type": "code", "execution_count": null, + "id": "10", "metadata": {}, "outputs": [], "source": [ "# Aggregate across types by hour - 3-way breakdown\n", - "df_volume = df.groupby(\"hour\").agg({\n", - " \"total_txs\": \"sum\",\n", - " \"seen_before_slot\": \"sum\",\n", - " \"seen_after_slot\": \"sum\",\n", - "}).reset_index()\n", - "df_volume[\"never_seen\"] = df_volume[\"total_txs\"] - df_volume[\"seen_before_slot\"] - df_volume[\"seen_after_slot\"]\n", + "df_volume = df.group_by(\"hour\").agg(\n", + " pl.col(\"total_txs\").sum(),\n", + " pl.col(\"seen_before_slot\").sum(),\n", + 
" pl.col(\"seen_after_slot\").sum(),\n", + ")\n", + "df_volume = df_volume.with_columns(\n", + " (pl.col(\"total_txs\") - pl.col(\"seen_before_slot\") - pl.col(\"seen_after_slot\")).alias(\"never_seen\"),\n", + ")\n", + "\n", + "df_volume_pd = df_volume.to_pandas()\n", "\n", "fig = go.Figure()\n", "fig.add_trace(go.Bar(\n", - " x=df_volume[\"hour\"],\n", - " y=df_volume[\"seen_before_slot\"],\n", + " x=df_volume_pd[\"hour\"],\n", + " y=df_volume_pd[\"seen_before_slot\"],\n", " name=\"Before slot (public)\",\n", " marker_color=\"#27ae60\",\n", "))\n", "fig.add_trace(go.Bar(\n", - " x=df_volume[\"hour\"],\n", - " y=df_volume[\"seen_after_slot\"],\n", + " x=df_volume_pd[\"hour\"],\n", + " y=df_volume_pd[\"seen_after_slot\"],\n", " name=\"After slot (propagated)\",\n", " marker_color=\"#3498db\",\n", "))\n", "fig.add_trace(go.Bar(\n", - " x=df_volume[\"hour\"],\n", - " y=df_volume[\"never_seen\"],\n", + " x=df_volume_pd[\"hour\"],\n", + " y=df_volume_pd[\"never_seen\"],\n", " name=\"Never seen (private)\",\n", " marker_color=\"#95a5a6\",\n", "))\n", @@ -275,6 +311,7 @@ }, { "cell_type": "markdown", + "id": "11", "metadata": {}, "source": [ "## Coverage heatmap\n", @@ -285,17 +322,30 @@ { "cell_type": "code", "execution_count": null, + "id": "12", "metadata": {}, "outputs": [], "source": [ "# Pivot for heatmap using hourly aggregated data\n", - "df_pivot = df_hourly.pivot(index=\"tx_type_label\", columns=\"hour\", values=\"coverage_pct\").fillna(0)\n", + "df_pivot = df_hourly.pivot(\n", + " on=\"hour\",\n", + " index=\"tx_type_label\",\n", + " values=\"coverage_pct\",\n", + ").fill_null(0)\n", + "\n", + "# Get column order (all columns except tx_type_label, sorted)\n", + "value_cols = [c for c in df_pivot.columns if c != \"tx_type_label\"]\n", + "value_cols_sorted = sorted(value_cols)\n", + "\n", + "# Extract data for heatmap\n", + "z_values = df_pivot.select(value_cols_sorted).to_numpy()\n", + "y_labels = df_pivot[\"tx_type_label\"].to_list()\n", "\n", "fig = go.Figure(\n", " data=go.Heatmap(\n", - " z=df_pivot.values,\n", - " x=df_pivot.columns,\n", - " y=df_pivot.index,\n", + " z=z_values,\n", + " x=value_cols_sorted,\n", + " y=y_labels,\n", " colorscale=\"Greens\",\n", " colorbar=dict(title=dict(text=\"Coverage %\", side=\"right\")),\n", " )\n", @@ -311,6 +361,7 @@ }, { "cell_type": "markdown", + "id": "13", "metadata": {}, "source": [ "## Mempool age distribution\n", @@ -321,62 +372,64 @@ { "cell_type": "code", "execution_count": null, + "id": "14", "metadata": {}, "outputs": [], "source": [ - "# Extract all percentiles for each type\n", - "def extract_percentiles(group):\n", - " # Collect all non-null percentile arrays, weighted by seen_before_slot count\n", - " pct_arrays = []\n", - " for _, row in group.iterrows():\n", - " if row['seen_before_slot'] > 0 and row['age_percentiles_ms'] is not None:\n", - " pcts = row['age_percentiles_ms']\n", - " if not any(np.isnan(pcts)):\n", - " pct_arrays.append(pcts)\n", - " \n", - " if not pct_arrays:\n", - " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n", - " \n", - " # Average percentiles across slots (simple mean for now)\n", - " avg_pcts = np.nanmean(pct_arrays, axis=0)\n", - " return pd.Series({\n", - " 'p50': avg_pcts[0] / 1000,\n", - " 'p75': avg_pcts[1] / 1000,\n", - " 'p80': avg_pcts[2] / 1000,\n", - " 'p85': avg_pcts[3] / 1000,\n", - " 'p90': avg_pcts[4] / 1000,\n", - " 'p95': avg_pcts[5] / 1000,\n", - " 'p99': avg_pcts[6] / 1000,\n", - " })\n", - "\n", - 
"df_age = df.groupby(['tx_type', 'tx_type_label']).apply(extract_percentiles, include_groups=False).reset_index()\n", + "# Extract percentiles by tx_type using polars aggregation\n", + "# Filter to rows with valid data, then compute mean percentiles per type\n", + "df_with_pcts = df.filter(\n", + " (pl.col(\"seen_before_slot\") > 0) & \n", + " pl.col(\"age_percentiles_ms\").is_not_null() &\n", + " (pl.col(\"age_percentiles_ms\").list.len() >= 7)\n", + ")\n", + "\n", + "# Extract individual percentiles and compute mean per tx_type\n", + "df_age = df_with_pcts.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n", + " (pl.col(\"age_percentiles_ms\").list.get(0).mean() / 1000).alias(\"p50\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(1).mean() / 1000).alias(\"p75\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(2).mean() / 1000).alias(\"p80\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(3).mean() / 1000).alias(\"p85\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(4).mean() / 1000).alias(\"p90\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(5).mean() / 1000).alias(\"p95\"),\n", + " (pl.col(\"age_percentiles_ms\").list.get(6).mean() / 1000).alias(\"p99\"),\n", + ")\n", "\n", "# Display age table\n", - "age_display = df_age[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n", - "age_display.columns = ['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n", - "for col in age_display.columns[1:]:\n", - " age_display[col] = age_display[col].round(1)\n", - "age_display" + "age_display = df_age.select(\n", + " pl.col(\"tx_type_label\").alias(\"Type\"),\n", + " pl.col(\"p50\").round(1).alias(\"p50 (s)\"),\n", + " pl.col(\"p75\").round(1).alias(\"p75 (s)\"),\n", + " pl.col(\"p90\").round(1).alias(\"p90 (s)\"),\n", + " pl.col(\"p95\").round(1).alias(\"p95 (s)\"),\n", + " pl.col(\"p99\").round(1).alias(\"p99 (s)\"),\n", + ")\n", + "age_display.to_pandas()" ] }, { "cell_type": "code", "execution_count": null, + "id": "15", "metadata": {}, "outputs": [], "source": [ "# Visualize age percentiles as line chart\n", - "df_age_long = df_age.melt(\n", - " id_vars=['tx_type', 'tx_type_label'],\n", - " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n", - " var_name='percentile',\n", - " value_name='age_s'\n", + "df_age_long = df_age.unpivot(\n", + " index=[\"tx_type\", \"tx_type_label\"],\n", + " on=[\"p50\", \"p75\", \"p80\", \"p85\", \"p90\", \"p95\", \"p99\"],\n", + " variable_name=\"percentile\",\n", + " value_name=\"age_s\",\n", ")\n", "# Convert percentile labels to numeric for x-axis\n", - "df_age_long['pct_num'] = df_age_long['percentile'].str.replace('p', '').astype(int)\n", + "df_age_long = df_age_long.with_columns(\n", + " pl.col(\"percentile\").str.replace(\"p\", \"\").cast(pl.Int64).alias(\"pct_num\"),\n", + ")\n", + "\n", + "df_age_long_pd = df_age_long.to_pandas()\n", "\n", "fig = px.line(\n", - " df_age_long,\n", + " df_age_long_pd,\n", " x='pct_num',\n", " y='age_s',\n", " color='tx_type_label',\n", @@ -397,28 +450,37 @@ { "cell_type": "code", "execution_count": null, + "id": "16", "metadata": {}, "outputs": [], "source": [ "# Aggregate histogram buckets across all slots per tx type\n", "hist_cols = [f'age_hist_{i}' for i in range(15)]\n", - "df_hist = df.groupby(['tx_type', 'tx_type_label'])[hist_cols].sum().reset_index()\n", + "df_hist = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n", + " [pl.col(c).sum() for c in hist_cols]\n", + ")\n", "\n", "# Melt to long format for plotting\n", - "df_hist_long = df_hist.melt(\n", - " 
id_vars=['tx_type', 'tx_type_label'],\n", - " value_vars=hist_cols,\n", - " var_name='bucket',\n", - " value_name='count'\n", + "df_hist_long = df_hist.unpivot(\n", + " index=[\"tx_type\", \"tx_type_label\"],\n", + " on=hist_cols,\n", + " variable_name=\"bucket\",\n", + " value_name=\"count\",\n", + ")\n", + "df_hist_long = df_hist_long.with_columns(\n", + " pl.col(\"bucket\").str.extract(r\"(\\d+)\").cast(pl.Int64).alias(\"bucket_idx\"),\n", + ")\n", + "df_hist_long = df_hist_long.with_columns(\n", + " pl.col(\"bucket_idx\").replace_strict(dict(enumerate(HIST_LABELS)), default=None).alias(\"bucket_label\"),\n", ")\n", - "df_hist_long['bucket_idx'] = df_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n", - "df_hist_long['bucket_label'] = df_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n", "\n", "# Sort by bucket index for proper ordering\n", - "df_hist_long = df_hist_long.sort_values(['tx_type', 'bucket_idx'])\n", + "df_hist_long = df_hist_long.sort([\"tx_type\", \"bucket_idx\"])\n", + "\n", + "df_hist_long_pd = df_hist_long.to_pandas()\n", "\n", "fig = px.bar(\n", - " df_hist_long,\n", + " df_hist_long_pd,\n", " x='bucket_label',\n", " y='count',\n", " color='tx_type_label',\n", @@ -441,6 +503,7 @@ }, { "cell_type": "markdown", + "id": "17", "metadata": {}, "source": [ "## Propagation delay (seen after slot)\n", @@ -451,60 +514,62 @@ { "cell_type": "code", "execution_count": null, + "id": "18", "metadata": {}, "outputs": [], "source": [ "# Extract delay percentiles for transactions seen AFTER slot start\n", - "def extract_delay_percentiles(group):\n", - " pct_arrays = []\n", - " for _, row in group.iterrows():\n", - " if row['seen_after_slot'] > 0 and row['delay_percentiles_ms'] is not None:\n", - " pcts = row['delay_percentiles_ms']\n", - " if not any(np.isnan(pcts)):\n", - " pct_arrays.append(pcts)\n", - " \n", - " if not pct_arrays:\n", - " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n", - " \n", - " avg_pcts = np.nanmean(pct_arrays, axis=0)\n", - " return pd.Series({\n", - " 'p50': avg_pcts[0] / 1000,\n", - " 'p75': avg_pcts[1] / 1000,\n", - " 'p80': avg_pcts[2] / 1000,\n", - " 'p85': avg_pcts[3] / 1000,\n", - " 'p90': avg_pcts[4] / 1000,\n", - " 'p95': avg_pcts[5] / 1000,\n", - " 'p99': avg_pcts[6] / 1000,\n", - " })\n", - "\n", - "df_delay = df.groupby(['tx_type', 'tx_type_label']).apply(extract_delay_percentiles, include_groups=False).reset_index()\n", + "df_with_delay = df.filter(\n", + " (pl.col(\"seen_after_slot\") > 0) & \n", + " pl.col(\"delay_percentiles_ms\").is_not_null() &\n", + " (pl.col(\"delay_percentiles_ms\").list.len() >= 7)\n", + ")\n", + "\n", + "df_delay = df_with_delay.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n", + " (pl.col(\"delay_percentiles_ms\").list.get(0).mean() / 1000).alias(\"p50\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(1).mean() / 1000).alias(\"p75\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(2).mean() / 1000).alias(\"p80\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(3).mean() / 1000).alias(\"p85\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(4).mean() / 1000).alias(\"p90\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(5).mean() / 1000).alias(\"p95\"),\n", + " (pl.col(\"delay_percentiles_ms\").list.get(6).mean() / 1000).alias(\"p99\"),\n", + ")\n", "\n", "# Display delay table\n", - "delay_display = df_delay[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n", - "delay_display.columns = 
['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n", - "for col in delay_display.columns[1:]:\n", - " delay_display[col] = delay_display[col].round(2)\n", - "delay_display" + "delay_display = df_delay.select(\n", + " pl.col(\"tx_type_label\").alias(\"Type\"),\n", + " pl.col(\"p50\").round(2).alias(\"p50 (s)\"),\n", + " pl.col(\"p75\").round(2).alias(\"p75 (s)\"),\n", + " pl.col(\"p90\").round(2).alias(\"p90 (s)\"),\n", + " pl.col(\"p95\").round(2).alias(\"p95 (s)\"),\n", + " pl.col(\"p99\").round(2).alias(\"p99 (s)\"),\n", + ")\n", + "delay_display.to_pandas()" ] }, { "cell_type": "code", "execution_count": null, + "id": "19", "metadata": {}, "outputs": [], "source": [ "# Visualize delay percentiles as line chart\n", - "df_delay_long = df_delay.melt(\n", - " id_vars=['tx_type', 'tx_type_label'],\n", - " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n", - " var_name='percentile',\n", - " value_name='delay_s'\n", + "df_delay_long = df_delay.unpivot(\n", + " index=[\"tx_type\", \"tx_type_label\"],\n", + " on=[\"p50\", \"p75\", \"p80\", \"p85\", \"p90\", \"p95\", \"p99\"],\n", + " variable_name=\"percentile\",\n", + " value_name=\"delay_s\",\n", ")\n", "# Convert percentile labels to numeric for x-axis\n", - "df_delay_long['pct_num'] = df_delay_long['percentile'].str.replace('p', '').astype(int)\n", + "df_delay_long = df_delay_long.with_columns(\n", + " pl.col(\"percentile\").str.replace(\"p\", \"\").cast(pl.Int64).alias(\"pct_num\"),\n", + ")\n", + "\n", + "df_delay_long_pd = df_delay_long.to_pandas()\n", "\n", "fig = px.line(\n", - " df_delay_long,\n", + " df_delay_long_pd,\n", " x='pct_num',\n", " y='delay_s',\n", " color='tx_type_label',\n", @@ -525,28 +590,37 @@ { "cell_type": "code", "execution_count": null, + "id": "20", "metadata": {}, "outputs": [], "source": [ "# Aggregate delay histogram buckets across all slots per tx type\n", "delay_hist_cols = [f'delay_hist_{i}' for i in range(15)]\n", - "df_delay_hist = df.groupby(['tx_type', 'tx_type_label'])[delay_hist_cols].sum().reset_index()\n", + "df_delay_hist = df.group_by([\"tx_type\", \"tx_type_label\"]).agg(\n", + " [pl.col(c).sum() for c in delay_hist_cols]\n", + ")\n", "\n", "# Melt to long format for plotting\n", - "df_delay_hist_long = df_delay_hist.melt(\n", - " id_vars=['tx_type', 'tx_type_label'],\n", - " value_vars=delay_hist_cols,\n", - " var_name='bucket',\n", - " value_name='count'\n", + "df_delay_hist_long = df_delay_hist.unpivot(\n", + " index=[\"tx_type\", \"tx_type_label\"],\n", + " on=delay_hist_cols,\n", + " variable_name=\"bucket\",\n", + " value_name=\"count\",\n", + ")\n", + "df_delay_hist_long = df_delay_hist_long.with_columns(\n", + " pl.col(\"bucket\").str.extract(r\"(\\d+)\").cast(pl.Int64).alias(\"bucket_idx\"),\n", + ")\n", + "df_delay_hist_long = df_delay_hist_long.with_columns(\n", + " pl.col(\"bucket_idx\").replace_strict(dict(enumerate(HIST_LABELS)), default=None).alias(\"bucket_label\"),\n", ")\n", - "df_delay_hist_long['bucket_idx'] = df_delay_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n", - "df_delay_hist_long['bucket_label'] = df_delay_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n", "\n", "# Sort by bucket index for proper ordering\n", - "df_delay_hist_long = df_delay_hist_long.sort_values(['tx_type', 'bucket_idx'])\n", + "df_delay_hist_long = df_delay_hist_long.sort([\"tx_type\", \"bucket_idx\"])\n", + "\n", + "df_delay_hist_long_pd = df_delay_hist_long.to_pandas()\n", "\n", "fig = px.bar(\n", - " df_delay_hist_long,\n", + " 
df_delay_hist_long_pd,\n", " x='bucket_label',\n", " y='count',\n", " color='tx_type_label',\n", @@ -569,6 +643,7 @@ }, { "cell_type": "markdown", + "id": "21", "metadata": {}, "source": [ "## Sentry coverage\n", @@ -579,6 +654,7 @@ { "cell_type": "code", "execution_count": null, + "id": "22", "metadata": { "tags": [ "sql-fold" @@ -592,16 +668,21 @@ { "cell_type": "code", "execution_count": null, + "id": "23", "metadata": {}, "outputs": [], "source": [ - "df_sentry = load_parquet(\"sentry_coverage\", target_date)\n", + "df_sentry = pl.from_pandas(load_parquet(\"sentry_coverage\", target_date))\n", "\n", "# Shorten sentry names for display\n", - "df_sentry[\"sentry_short\"] = df_sentry[\"sentry\"].str.replace(\"ethpandaops/mainnet/\", \"\")\n", + "df_sentry = df_sentry.with_columns(\n", + " pl.col(\"sentry\").str.replace(\"ethpandaops/mainnet/\", \"\").alias(\"sentry_short\"),\n", + ")\n", + "\n", + "df_sentry_pd = df_sentry.head(15).to_pandas()\n", "\n", "fig = px.bar(\n", - " df_sentry.head(15),\n", + " df_sentry_pd,\n", " x=\"coverage_pct\",\n", " y=\"sentry_short\",\n", " orientation=\"h\",\n", @@ -631,5 +712,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/notebooks/05-mev-pipeline.ipynb b/notebooks/05-mev-pipeline.ipynb index 5f835ec..061598b 100644 --- a/notebooks/05-mev-pipeline.ipynb +++ b/notebooks/05-mev-pipeline.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "0", "metadata": {}, "source": [ "Analysis of MEV pipeline timing and its effect on block propagation on Ethereum mainnet." @@ -11,7 +10,6 @@ { "cell_type": "code", "execution_count": null, - "id": "1", "metadata": { "tags": [ "parameters" @@ -20,6 +18,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "import polars as pl\n", "import numpy as np\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", @@ -34,10 +33,8 @@ { "cell_type": "code", "execution_count": null, - "id": "2", "metadata": { "tags": [ - "hide-input", "sql-fold" ] }, @@ -49,24 +46,29 @@ { "cell_type": "code", "execution_count": null, - "id": "3", "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"block_production_timeline\", target_date)\n", + "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n", "\n", "# Flag MEV vs local blocks\n", - "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n", - "df[\"block_type\"] = df[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n", + "df = df.with_columns(\n", + " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\")\n", + ").with_columns(\n", + " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\")\n", + ")\n", + "\n", + "total_slots = len(df)\n", + "mev_count = df.filter(pl.col(\"has_mev\")).height\n", + "local_count = df.filter(~pl.col(\"has_mev\")).height\n", "\n", - "print(f\"Total slots: {len(df):,}\")\n", - "print(f\"MEV blocks: {df['has_mev'].sum():,} ({df['has_mev'].mean()*100:.1f}%)\")\n", - "print(f\"Local blocks: {(~df['has_mev']).sum():,} ({(~df['has_mev']).mean()*100:.1f}%)\")" + "print(f\"Total slots: {total_slots:,}\")\n", + "print(f\"MEV blocks: {mev_count:,} ({mev_count/total_slots*100:.1f}%)\")\n", + "print(f\"Local blocks: {local_count:,} ({local_count/total_slots*100:.1f}%)\")" ] }, { "cell_type": "markdown", - "id": "4", "metadata": {}, "source": [ "## Bid trace coverage\n", @@ -82,27 +84,27 @@ { "cell_type": "code", "execution_count": null, - "id": "5", "metadata": {}, "outputs": [], "source": [ "# Bid trace 
coverage analysis\n", - "df_trace = df[df[\"has_mev\"]].copy()\n", - "df_trace[\"relay\"] = df_trace[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else \"Unknown\")\n", - "df_trace[\"has_bid_timing\"] = df_trace[\"winning_bid_ms\"].notna()\n", + "df_trace = df.filter(pl.col(\"has_mev\")).with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).fill_null(\"Unknown\").alias(\"relay\"),\n", + " pl.col(\"winning_bid_ms\").is_not_null().alias(\"has_bid_timing\")\n", + ")\n", "\n", "# Aggregate by relay\n", - "relay_coverage = df_trace.groupby(\"relay\").agg(\n", - " total=(\"slot\", \"count\"),\n", - " with_timing=(\"has_bid_timing\", \"sum\"),\n", - ").reset_index()\n", - "relay_coverage[\"without_timing\"] = relay_coverage[\"total\"] - relay_coverage[\"with_timing\"]\n", - "relay_coverage[\"pct_with_timing\"] = (relay_coverage[\"with_timing\"] / relay_coverage[\"total\"] * 100).round(1)\n", - "relay_coverage = relay_coverage.sort_values(\"total\", ascending=True)\n", + "relay_coverage = df_trace.group_by(\"relay\").agg(\n", + " pl.col(\"slot\").count().alias(\"total\"),\n", + " pl.col(\"has_bid_timing\").sum().alias(\"with_timing\"),\n", + ").with_columns(\n", + " (pl.col(\"total\") - pl.col(\"with_timing\")).alias(\"without_timing\"),\n", + " (pl.col(\"with_timing\") / pl.col(\"total\") * 100).round(1).alias(\"pct_with_timing\")\n", + ").sort(\"total\")\n", "\n", "# Summary stats\n", - "total_mev = relay_coverage[\"total\"].sum()\n", - "total_with_timing = relay_coverage[\"with_timing\"].sum()\n", + "total_mev = relay_coverage.select(pl.col(\"total\").sum()).item()\n", + "total_with_timing = relay_coverage.select(pl.col(\"with_timing\").sum()).item()\n", "print(f\"MEV blocks: {total_mev:,}\")\n", "print(f\"With bid timing: {total_with_timing:,} ({total_with_timing/total_mev*100:.1f}%)\")\n", "print(f\"Without bid timing: {total_mev - total_with_timing:,} ({(total_mev - total_with_timing)/total_mev*100:.1f}%)\")" @@ -111,27 +113,28 @@ { "cell_type": "code", "execution_count": null, - "id": "6", "metadata": {}, "outputs": [], "source": [ "# Stacked horizontal bar chart\n", + "relay_coverage_pd = relay_coverage.to_pandas()\n", + "\n", "fig = go.Figure()\n", "\n", "fig.add_trace(go.Bar(\n", - " y=relay_coverage[\"relay\"],\n", - " x=relay_coverage[\"with_timing\"],\n", + " y=relay_coverage_pd[\"relay\"],\n", + " x=relay_coverage_pd[\"with_timing\"],\n", " name=\"With bid timing\",\n", " orientation=\"h\",\n", " marker_color=\"#2ecc71\",\n", - " text=relay_coverage.apply(lambda r: f\"{r['pct_with_timing']:.0f}%\" if r['with_timing'] > 0 else \"\", axis=1),\n", + " text=relay_coverage_pd.apply(lambda r: f\"{r['pct_with_timing']:.0f}%\" if r['with_timing'] > 0 else \"\", axis=1),\n", " textposition=\"inside\",\n", " hovertemplate=\"%{y}
With timing: %{x:,}\",\n", "))\n", "\n", "fig.add_trace(go.Bar(\n", - " y=relay_coverage[\"relay\"],\n", - " x=relay_coverage[\"without_timing\"],\n", + " y=relay_coverage_pd[\"relay\"],\n", + " x=relay_coverage_pd[\"without_timing\"],\n", " name=\"Without bid timing\",\n", " orientation=\"h\",\n", " marker_color=\"#e74c3c\",\n", @@ -151,7 +154,6 @@ }, { "cell_type": "markdown", - "id": "7", "metadata": {}, "source": [ "## MEV pipeline and blob count effects\n", @@ -162,40 +164,38 @@ { "cell_type": "code", "execution_count": null, - "id": "8", "metadata": {}, "outputs": [], "source": [ "# Prepare data for MEV analysis\n", - "df_mev = df.copy()\n", - "\n", "# Filter out missed slots (block never produced - shows as invalid timestamps)\n", "# A valid block_first_seen_ms should be positive and reasonable (< 60 seconds)\n", - "df_mev = df_mev[df_mev[\"block_first_seen_ms\"].notna()]\n", - "df_mev = df_mev[(df_mev[\"block_first_seen_ms\"] >= 0) & (df_mev[\"block_first_seen_ms\"] < 60000)]\n", - "\n", - "# Flag MEV vs local blocks\n", - "df_mev[\"has_mev\"] = df_mev[\"winning_bid_value\"].notna()\n", - "df_mev[\"block_type\"] = df_mev[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n", - "\n", - "# Calculate bidding window duration\n", - "df_mev[\"bidding_duration_ms\"] = df_mev[\"last_bid_ms\"] - df_mev[\"first_bid_ms\"]\n", - "\n", - "# Calculate block to column delay (for slots with blobs)\n", - "df_mev[\"block_to_column_ms\"] = df_mev[\"first_column_first_seen_ms\"] - df_mev[\"block_first_seen_ms\"]\n", + "df_mev = df.filter(\n", + " pl.col(\"block_first_seen_ms\").is_not_null() &\n", + " (pl.col(\"block_first_seen_ms\") >= 0) &\n", + " (pl.col(\"block_first_seen_ms\") < 60000)\n", + ")\n", "\n", - "# Calculate bid to block delay (time from winning bid to block appearing)\n", - "df_mev[\"bid_to_block_ms\"] = df_mev[\"block_first_seen_ms\"] - df_mev[\"winning_bid_ms\"]\n", + "# Calculate derived columns\n", + "df_mev = df_mev.with_columns(\n", + " (pl.col(\"last_bid_ms\") - pl.col(\"first_bid_ms\")).alias(\"bidding_duration_ms\"),\n", + " (pl.col(\"first_column_first_seen_ms\") - pl.col(\"block_first_seen_ms\")).alias(\"block_to_column_ms\"),\n", + " (pl.col(\"block_first_seen_ms\") - pl.col(\"winning_bid_ms\")).alias(\"bid_to_block_ms\"),\n", + ")\n", "\n", "# Dynamic blob count bins based on actual data\n", - "max_blobs = df_mev[\"blob_count\"].max()\n", + "max_blobs = df_mev.select(pl.col(\"blob_count\").max()).item()\n", "bin_size = 3\n", "# Create bins: [-1, 0, 3, 6, 9, 12, 15, ...] 
to match 0, 1-3, 4-6, etc.\n", "bins = [-1, 0] + list(range(bin_size, max_blobs + bin_size, bin_size))\n", "if bins[-1] < max_blobs:\n", " bins.append(((max_blobs // bin_size) + 1) * bin_size)\n", "labels = [\"0\"] + [f\"{bins[i]+1}-{bins[i+1]}\" for i in range(1, len(bins)-1)]\n", - "df_mev[\"blob_bin\"] = pd.cut(df_mev[\"blob_count\"], bins=bins, labels=labels)\n", + "\n", + "# Create blob_bin column using cut (pass only the inner break points: polars cut() requires len(labels) == len(breaks) + 1)\n", + "df_mev = df_mev.with_columns(\n", + " pl.col(\"blob_count\").cut(bins[1:-1], labels=labels).alias(\"blob_bin\")\n", + ")\n", "BLOB_BIN_ORDER = labels # Store for use in charts\n", "\n", "# Generate Plasma-based discrete colors, truncated to avoid light yellow (poor contrast)\n", @@ -208,18 +208,22 @@ "PLASMA_TRUNCATED = px.colors.sample_colorscale(\"Plasma\", [i/10 * PLASMA_MAX for i in range(11)])\n", "\n", "# MEV-only subset for MEV timing plots\n", - "df_mev_only = df_mev[df_mev[\"has_mev\"]].copy()\n", + "df_mev_only = df_mev.filter(pl.col(\"has_mev\"))\n", + "\n", + "total_df = len(df)\n", + "valid_blocks = len(df_mev)\n", + "mev_blocks = df_mev.filter(pl.col(\"has_mev\")).height\n", + "local_blocks = df_mev.filter(~pl.col(\"has_mev\")).height\n", "\n", - "print(f\"Total slots in data: {len(df):,}\")\n", - "print(f\"Slots with valid blocks: {len(df_mev):,} ({len(df_mev)/len(df)*100:.1f}%)\")\n", - "print(f\"MEV blocks: {df_mev['has_mev'].sum():,} ({df_mev['has_mev'].mean()*100:.1f}%)\")\n", - "print(f\"Local blocks: {(~df_mev['has_mev']).sum():,} ({(~df_mev['has_mev']).mean()*100:.1f}%)\")\n", + "print(f\"Total slots in data: {total_df:,}\")\n", + "print(f\"Slots with valid blocks: {valid_blocks:,} ({valid_blocks/total_df*100:.1f}%)\")\n", + "print(f\"MEV blocks: {mev_blocks:,} ({mev_blocks/valid_blocks*100:.1f}%)\")\n", + "print(f\"Local blocks: {local_blocks:,} ({local_blocks/valid_blocks*100:.1f}%)\")\n", "print(f\"Max blob count: {max_blobs}, bins: {labels}\")" ] }, { "cell_type": "markdown", - "id": "9", "metadata": {}, "source": [ "### Winning bid timing vs block arrival\n", @@ -230,15 +234,17 @@ { "cell_type": "code", "execution_count": null, - "id": "10", "metadata": {}, "outputs": [], "source": [ "if len(df_mev_only) > 0:\n", " # Extract first relay from array for display\n", - " df_plot = df_mev_only.dropna(subset=[\"winning_bid_ms\", \"block_first_seen_ms\"]).copy()\n", - " df_plot[\"relay\"] = df_plot[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n", - " df_plot[\"blob_count_f\"] = df_plot[\"blob_count\"].astype(float) # Force continuous color\n", + " df_plot = df_mev_only.filter(\n", + " pl.col(\"winning_bid_ms\").is_not_null() & pl.col(\"block_first_seen_ms\").is_not_null()\n", + " ).with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\"),\n", + " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\")\n", + " ).to_pandas()\n", "\n", " fig = px.scatter(\n", " df_plot,\n", @@ -264,7 +270,6 @@ }, { "cell_type": "markdown", - "id": "11", "metadata": {}, "source": [ "### Bid to block delay\n", @@ -275,35 +280,37 @@ { "cell_type": "code", "execution_count": null, - "id": "12", "metadata": {}, "outputs": [], "source": [ "# Prepare bid to block delay data\n", - "df_bid_delay = df_mev_only.dropna(subset=[\"bid_to_block_ms\"]).copy()\n", - "\n", - "# Filter to reasonable range (positive delays, < 5 seconds)\n", - "df_bid_delay = df_bid_delay[(df_bid_delay[\"bid_to_block_ms\"] > 0) & (df_bid_delay[\"bid_to_block_ms\"] < 5000)]\n", - "df_bid_delay[\"relay\"] = df_bid_delay[\"winning_relays\"].apply(lambda x: x[0] if 
len(x) > 0 else None)\n", - "df_bid_delay[\"blob_count_f\"] = df_bid_delay[\"blob_count\"].astype(float)\n", + "df_bid_delay = df_mev_only.filter(\n", + " pl.col(\"bid_to_block_ms\").is_not_null() &\n", + " (pl.col(\"bid_to_block_ms\") > 0) &\n", + " (pl.col(\"bid_to_block_ms\") < 5000)\n", + ").with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\"),\n", + " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\")\n", + ")\n", "\n", "# Summary stats\n", - "median_delay = df_bid_delay[\"bid_to_block_ms\"].median()\n", - "p95_delay = df_bid_delay[\"bid_to_block_ms\"].quantile(0.95)\n", + "median_delay = df_bid_delay.select(pl.col(\"bid_to_block_ms\").median()).item()\n", + "p95_delay = df_bid_delay.select(pl.col(\"bid_to_block_ms\").quantile(0.95)).item()\n", "print(f\"Bid to block delay (MEV blocks): median {median_delay:.0f}ms, P95 {p95_delay:.0f}ms, n={len(df_bid_delay):,}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "13", "metadata": {}, "outputs": [], "source": [ "if len(df_bid_delay) > 0:\n", + " df_bid_delay_pd = df_bid_delay.to_pandas()\n", + " \n", " # Scatter plot: bid_to_block_ms vs blob_count\n", " fig = px.scatter(\n", - " df_bid_delay,\n", + " df_bid_delay_pd,\n", " x=\"blob_count\",\n", " y=\"bid_to_block_ms\",\n", " color=\"blob_count_f\",\n", @@ -314,7 +321,10 @@ " )\n", " \n", " # Add median line per blob count\n", - " median_by_blob = df_bid_delay.groupby(\"blob_count\")[\"bid_to_block_ms\"].median().reset_index()\n", + " median_by_blob = df_bid_delay.group_by(\"blob_count\").agg(\n", + " pl.col(\"bid_to_block_ms\").median()\n", + " ).sort(\"blob_count\").to_pandas()\n", + " \n", " fig.add_trace(go.Scatter(\n", " x=median_by_blob[\"blob_count\"],\n", " y=median_by_blob[\"bid_to_block_ms\"],\n", @@ -340,7 +350,6 @@ }, { "cell_type": "markdown", - "id": "14", "metadata": {}, "source": [ "### Winning bid value vs block arrival\n", @@ -351,17 +360,18 @@ { "cell_type": "code", "execution_count": null, - "id": "15", "metadata": {}, "outputs": [], "source": [ "if len(df_mev_only) > 0:\n", - " # Convert winning_bid_value from wei to ETH\n", - " df_plot = df_mev_only.copy()\n", - " df_plot[\"winning_bid_eth\"] = df_plot[\"winning_bid_value\"].astype(float) / 1e18\n", - " df_plot[\"relay\"] = df_plot[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n", - " df_plot[\"blob_count_f\"] = df_plot[\"blob_count\"].astype(float) # Force continuous color\n", - " df_plot = df_plot[df_plot[\"winning_bid_eth\"] > 0].dropna(subset=[\"block_first_seen_ms\"]) # Filter for log scale\n", + " # Convert winning_bid_value from wei to ETH and filter\n", + " df_plot = df_mev_only.with_columns(\n", + " (pl.col(\"winning_bid_value\").cast(pl.Float64) / 1e18).alias(\"winning_bid_eth\"),\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\"),\n", + " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\")\n", + " ).filter(\n", + " (pl.col(\"winning_bid_eth\") > 0) & pl.col(\"block_first_seen_ms\").is_not_null()\n", + " ).to_pandas()\n", "\n", " fig = px.scatter(\n", " df_plot,\n", @@ -386,7 +396,6 @@ }, { "cell_type": "markdown", - "id": "16", "metadata": {}, "source": [ "### Bidding window duration vs block arrival\n", @@ -397,15 +406,18 @@ { "cell_type": "code", "execution_count": null, - "id": "17", "metadata": {}, "outputs": [], "source": [ - "df_bidding = df_mev_only.dropna(subset=[\"bidding_duration_ms\", \"block_first_seen_ms\"])\n", + "df_bidding = df_mev_only.filter(\n", + " 
pl.col(\"bidding_duration_ms\").is_not_null() & pl.col(\"block_first_seen_ms\").is_not_null()\n", + ")\n", + "\n", "if len(df_bidding) > 0:\n", - " df_plot = df_bidding.copy()\n", - " df_plot[\"relay\"] = df_plot[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n", - " df_plot[\"blob_count_f\"] = df_plot[\"blob_count\"].astype(float) # Force continuous color\n", + " df_plot = df_bidding.with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\"),\n", + " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\")\n", + " ).to_pandas()\n", "\n", " fig = px.scatter(\n", " df_plot,\n", @@ -429,7 +441,6 @@ }, { "cell_type": "markdown", - "id": "18", "metadata": {}, "source": [ "### Block arrival by relay and blob count\n", @@ -442,19 +453,21 @@ { "cell_type": "code", "execution_count": null, - "id": "19", "metadata": {}, "outputs": [], "source": [ "# Extract first relay from array for analysis\n", - "df_relay = df_mev_only.copy()\n", - "df_relay[\"relay\"] = df_relay[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n", - "df_relay = df_relay.dropna(subset=[\"relay\"])\n", + "df_relay = df_mev_only.with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\")\n", + ").filter(pl.col(\"relay\").is_not_null())\n", "\n", "if len(df_relay) > 0:\n", " # Get top relays by volume\n", - " top_relays = df_relay[\"relay\"].value_counts().head(8).index.tolist()\n", - " df_relay_top = df_relay[df_relay[\"relay\"].isin(top_relays)].copy()\n", + " top_relays = df_relay.group_by(\"relay\").agg(\n", + " pl.count().alias(\"count\")\n", + " ).sort(\"count\", descending=True).head(8).select(\"relay\").to_series().to_list()\n", + " \n", + " df_relay_top = df_relay.filter(pl.col(\"relay\").is_in(top_relays)).to_pandas()\n", " \n", " fig = px.box(\n", " df_relay_top,\n", @@ -476,7 +489,6 @@ }, { "cell_type": "markdown", - "id": "20", "metadata": {}, "source": [ "## MEV vs local block comparison\n", @@ -489,41 +501,39 @@ { "cell_type": "code", "execution_count": null, - "id": "21", "metadata": {}, "outputs": [], "source": [ "# Prepare data for MEV vs Local comparison\n", - "df_compare = df_mev.copy()\n", - "\n", "# Create ordered list from 0 to max_blobs (reversed for bottom-to-top display)\n", "all_blob_counts = list(range(int(max_blobs) + 1))\n", "blob_count_order = [str(b) for b in all_blob_counts]\n", "# Reverse for category_orders (Plotly categorical y-axis goes top-to-bottom by default)\n", "blob_count_order_reversed = blob_count_order[::-1]\n", "\n", - "# Convert blob_count to string for proper categorical ordering\n", - "df_compare[\"blob_count_str\"] = df_compare[\"blob_count\"].astype(str)\n", - "\n", - "# Ensure x-axis is numeric (not datetime)\n", - "df_compare[\"block_first_seen_ms\"] = df_compare[\"block_first_seen_ms\"].astype(float)\n", + "# Convert blob_count to string for proper categorical ordering and ensure numeric x-axis\n", + "df_compare = df_mev.with_columns(\n", + " pl.col(\"blob_count\").cast(pl.Utf8).alias(\"blob_count_str\"),\n", + " pl.col(\"block_first_seen_ms\").cast(pl.Float64)\n", + ")\n", "\n", "# Summary stats\n", - "mev_median = df_compare[df_compare[\"block_type\"] == \"MEV\"][\"block_first_seen_ms\"].median()\n", - "local_median = df_compare[df_compare[\"block_type\"] == \"Local\"][\"block_first_seen_ms\"].median()\n", + "mev_median = df_compare.filter(pl.col(\"block_type\") == \"MEV\").select(pl.col(\"block_first_seen_ms\").median()).item()\n", + "local_median = df_compare.filter(pl.col(\"block_type\") == 
\"Local\").select(pl.col(\"block_first_seen_ms\").median()).item()\n", "print(f\"Block first seen median - MEV: {mev_median:.0f}ms, Local: {local_median:.0f}ms\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "22", "metadata": {}, "outputs": [], "source": [ "if len(df_compare) > 0:\n", + " df_compare_pd = df_compare.to_pandas()\n", + " \n", " fig = px.box(\n", - " df_compare,\n", + " df_compare_pd,\n", " y=\"blob_count_str\",\n", " x=\"block_first_seen_ms\",\n", " color=\"block_type\",\n", @@ -550,41 +560,39 @@ { "cell_type": "code", "execution_count": null, - "id": "23", "metadata": {}, "outputs": [], "source": [ "# Prepare first column first seen data\n", - "df_col_first = df_mev.dropna(subset=[\"first_column_first_seen_ms\"]).copy()\n", - "df_col_first = df_col_first[df_col_first[\"blob_count\"] > 0] # Only slots with blobs\n", + "df_col_first = df_mev.filter(\n", + " pl.col(\"first_column_first_seen_ms\").is_not_null() & (pl.col(\"blob_count\") > 0)\n", + ").with_columns(\n", + " pl.col(\"blob_count\").cast(pl.Utf8).alias(\"blob_count_str\"),\n", + " pl.col(\"first_column_first_seen_ms\").cast(pl.Float64)\n", + ")\n", "\n", "# Create ordered list from 1 to max_blobs (reversed for bottom-to-top display)\n", "col_blob_counts = list(range(1, int(max_blobs) + 1))\n", "col_blob_count_order = [str(b) for b in col_blob_counts]\n", "col_blob_count_order_reversed = col_blob_count_order[::-1]\n", "\n", - "# Convert blob_count to string for proper categorical ordering\n", - "df_col_first[\"blob_count_str\"] = df_col_first[\"blob_count\"].astype(str)\n", - "\n", - "# Ensure x-axis is numeric\n", - "df_col_first[\"first_column_first_seen_ms\"] = df_col_first[\"first_column_first_seen_ms\"].astype(float)\n", - "\n", "# Summary stats\n", - "mev_median = df_col_first[df_col_first[\"block_type\"] == \"MEV\"][\"first_column_first_seen_ms\"].median()\n", - "local_median = df_col_first[df_col_first[\"block_type\"] == \"Local\"][\"first_column_first_seen_ms\"].median()\n", + "mev_median = df_col_first.filter(pl.col(\"block_type\") == \"MEV\").select(pl.col(\"first_column_first_seen_ms\").median()).item()\n", + "local_median = df_col_first.filter(pl.col(\"block_type\") == \"Local\").select(pl.col(\"first_column_first_seen_ms\").median()).item()\n", "print(f\"First column seen median - MEV: {mev_median:.0f}ms, Local: {local_median:.0f}ms\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "24", "metadata": {}, "outputs": [], "source": [ "if len(df_col_first) > 0:\n", + " df_col_first_pd = df_col_first.to_pandas()\n", + " \n", " fig = px.box(\n", - " df_col_first,\n", + " df_col_first_pd,\n", " y=\"blob_count_str\",\n", " x=\"first_column_first_seen_ms\",\n", " color=\"block_type\",\n", @@ -613,36 +621,34 @@ { "cell_type": "code", "execution_count": null, - "id": "25", "metadata": {}, "outputs": [], "source": [ "# Prepare last column first seen data\n", - "df_col_last = df_mev.dropna(subset=[\"last_column_first_seen_ms\"]).copy()\n", - "df_col_last = df_col_last[df_col_last[\"blob_count\"] > 0] # Only slots with blobs\n", - "\n", - "# Convert blob_count to string for proper categorical ordering\n", - "df_col_last[\"blob_count_str\"] = df_col_last[\"blob_count\"].astype(str)\n", - "\n", - "# Ensure x-axis is numeric\n", - "df_col_last[\"last_column_first_seen_ms\"] = df_col_last[\"last_column_first_seen_ms\"].astype(float)\n", + "df_col_last = df_mev.filter(\n", + " pl.col(\"last_column_first_seen_ms\").is_not_null() & (pl.col(\"blob_count\") > 0)\n", + ").with_columns(\n", + " 
pl.col(\"blob_count\").cast(pl.Utf8).alias(\"blob_count_str\"),\n", + " pl.col(\"last_column_first_seen_ms\").cast(pl.Float64)\n", + ")\n", "\n", "# Summary stats\n", - "mev_median = df_col_last[df_col_last[\"block_type\"] == \"MEV\"][\"last_column_first_seen_ms\"].median()\n", - "local_median = df_col_last[df_col_last[\"block_type\"] == \"Local\"][\"last_column_first_seen_ms\"].median()\n", + "mev_median = df_col_last.filter(pl.col(\"block_type\") == \"MEV\").select(pl.col(\"last_column_first_seen_ms\").median()).item()\n", + "local_median = df_col_last.filter(pl.col(\"block_type\") == \"Local\").select(pl.col(\"last_column_first_seen_ms\").median()).item()\n", "print(f\"Last column seen median - MEV: {mev_median:.0f}ms, Local: {local_median:.0f}ms\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "26", "metadata": {}, "outputs": [], "source": [ "if len(df_col_last) > 0:\n", + " df_col_last_pd = df_col_last.to_pandas()\n", + " \n", " fig = px.box(\n", - " df_col_last,\n", + " df_col_last_pd,\n", " y=\"blob_count_str\",\n", " x=\"last_column_first_seen_ms\",\n", " color=\"block_type\",\n", @@ -670,7 +676,6 @@ }, { "cell_type": "markdown", - "id": "27", "metadata": {}, "source": [ "### Block propagation by builder\n", @@ -681,50 +686,60 @@ { "cell_type": "code", "execution_count": null, - "id": "28", "metadata": {}, "outputs": [], "source": [ "# Prepare builder data for heatmap\n", - "df_builder = df_mev_only.dropna(subset=[\"winning_builder\", \"block_first_seen_ms\"]).copy()\n", - "\n", - "# Shorten builder pubkeys for display (first 8 + last 4 chars)\n", - "df_builder[\"builder_short\"] = df_builder[\"winning_builder\"].apply(\n", - " lambda x: f\"{x[:10]}...{x[-4:]}\" if len(x) > 14 else x\n", + "df_builder = df_mev_only.filter(\n", + " pl.col(\"winning_builder\").is_not_null() & pl.col(\"block_first_seen_ms\").is_not_null()\n", + ").with_columns(\n", + " # Shorten builder pubkeys for display (first 10 + last 4 chars)\n", + " pl.when(pl.col(\"winning_builder\").str.len_chars() > 14)\n", + " .then(pl.col(\"winning_builder\").str.slice(0, 10) + pl.lit(\"...\") + pl.col(\"winning_builder\").str.slice(-4))\n", + " .otherwise(pl.col(\"winning_builder\"))\n", + " .alias(\"builder_short\")\n", ")\n", "\n", "# Get top builders by block count, sorted descending\n", - "builder_counts = df_builder[\"builder_short\"].value_counts()\n", - "top_builders = builder_counts.head(9).index.tolist()\n", + "builder_counts = df_builder.group_by(\"builder_short\").agg(\n", + " pl.count().alias(\"count\")\n", + ").sort(\"count\", descending=True)\n", + "\n", + "top_builders = builder_counts.head(9).select(\"builder_short\").to_series().to_list()\n", "builder_order = top_builders # Already sorted by count descending\n", - "df_builder_top = df_builder[df_builder[\"builder_short\"].isin(top_builders)].copy()\n", + "df_builder_top = df_builder.filter(pl.col(\"builder_short\").is_in(top_builders))\n", "\n", "# Print builder stats\n", "print(\"Top builders by block count:\")\n", + "builder_counts_dict = dict(zip(\n", + " builder_counts.select(\"builder_short\").to_series().to_list(),\n", + " builder_counts.select(\"count\").to_series().to_list()\n", + "))\n", "for builder in builder_order:\n", - " count = builder_counts[builder]\n", - " median_ms = df_builder_top[df_builder_top[\"builder_short\"] == builder][\"block_first_seen_ms\"].median()\n", + " count = builder_counts_dict[builder]\n", + " median_ms = df_builder_top.filter(pl.col(\"builder_short\") == 
builder).select(pl.col(\"block_first_seen_ms\").median()).item()\n", " print(f\" {builder}: {count:,} blocks, median {median_ms:.0f}ms\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "29", "metadata": {}, "outputs": [], "source": [ "if len(df_builder_top) > 0:\n", + " df_builder_top_pd = df_builder_top.to_pandas()\n", + " \n", " n_builders = len(top_builders)\n", " n_cols = 3\n", " n_rows = (n_builders + n_cols - 1) // n_cols\n", " \n", " # Create 100ms bins for x-axis (block timing)\n", - " x_max = df_builder_top[\"block_first_seen_ms\"].quantile(0.99) # Trim outliers\n", + " x_max = df_builder_top.select(pl.col(\"block_first_seen_ms\").quantile(0.99)).item()\n", " x_bins = int(x_max // 100) + 1\n", " \n", " fig = px.density_heatmap(\n", - " df_builder_top,\n", + " df_builder_top_pd,\n", " x=\"block_first_seen_ms\",\n", " y=\"blob_count\",\n", " facet_col=\"builder_short\",\n", @@ -747,7 +762,7 @@ " # Clean up facet titles - add block count\n", " for ann in fig.layout.annotations:\n", " builder = ann.text.replace(\"builder_short=\", \"\")\n", - " count = builder_counts.get(builder, 0)\n", + " count = builder_counts_dict.get(builder, 0)\n", " ann.update(text=f\"{builder}
({count:,} blocks)\", font_size=9, yshift=8)\n", " \n", " # Add axis titles and ensure ticks are visible on all panels\n", @@ -773,7 +788,6 @@ }, { "cell_type": "markdown", - "id": "30", "metadata": {}, "source": [ "### Block propagation by relay\n", @@ -784,47 +798,56 @@ { "cell_type": "code", "execution_count": null, - "id": "31", "metadata": {}, "outputs": [], "source": [ "# Prepare relay data for heatmap\n", - "df_relay_heat = df_mev_only.dropna(subset=[\"block_first_seen_ms\"]).copy()\n", - "df_relay_heat[\"relay\"] = df_relay_heat[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else None)\n", - "df_relay_heat = df_relay_heat.dropna(subset=[\"relay\"])\n", + "df_relay_heat = df_mev_only.filter(\n", + " pl.col(\"block_first_seen_ms\").is_not_null()\n", + ").with_columns(\n", + " pl.col(\"winning_relays\").list.get(0).alias(\"relay\")\n", + ").filter(pl.col(\"relay\").is_not_null())\n", "\n", "# Get top relays by block count, sorted descending\n", - "relay_counts = df_relay_heat[\"relay\"].value_counts()\n", - "top_relays = relay_counts.head(9).index.tolist()\n", + "relay_counts = df_relay_heat.group_by(\"relay\").agg(\n", + " pl.count().alias(\"count\")\n", + ").sort(\"count\", descending=True)\n", + "\n", + "top_relays = relay_counts.head(9).select(\"relay\").to_series().to_list()\n", "relay_order = top_relays # Already sorted by count descending\n", - "df_relay_top = df_relay_heat[df_relay_heat[\"relay\"].isin(top_relays)].copy()\n", + "df_relay_top = df_relay_heat.filter(pl.col(\"relay\").is_in(top_relays))\n", "\n", "# Print relay stats\n", "print(\"Top relays by block count:\")\n", + "relay_counts_dict = dict(zip(\n", + " relay_counts.select(\"relay\").to_series().to_list(),\n", + " relay_counts.select(\"count\").to_series().to_list()\n", + "))\n", "for relay in relay_order:\n", - " count = relay_counts[relay]\n", - " median_ms = df_relay_top[df_relay_top[\"relay\"] == relay][\"block_first_seen_ms\"].median()\n", + " count = relay_counts_dict[relay]\n", + " median_ms = df_relay_top.filter(pl.col(\"relay\") == relay).select(pl.col(\"block_first_seen_ms\").median()).item()\n", " print(f\" {relay}: {count:,} blocks, median {median_ms:.0f}ms\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "32", "metadata": {}, "outputs": [], "source": [ "if len(df_relay_top) > 0:\n", + " df_relay_top_pd = df_relay_top.to_pandas()\n", + " \n", " n_relays = len(top_relays)\n", " n_cols = 3\n", " n_rows = (n_relays + n_cols - 1) // n_cols\n", " \n", " # Create 100ms bins for x-axis (block timing)\n", - " x_max = df_relay_top[\"block_first_seen_ms\"].quantile(0.99) # Trim outliers\n", + " x_max = df_relay_top.select(pl.col(\"block_first_seen_ms\").quantile(0.99)).item()\n", " x_bins = int(x_max // 100) + 1\n", " \n", " fig = px.density_heatmap(\n", - " df_relay_top,\n", + " df_relay_top_pd,\n", " x=\"block_first_seen_ms\",\n", " y=\"blob_count\",\n", " facet_col=\"relay\",\n", @@ -847,7 +870,7 @@ " # Clean up facet titles - add block count\n", " for ann in fig.layout.annotations:\n", " relay = ann.text.replace(\"relay=\", \"\")\n", - " count = relay_counts.get(relay, 0)\n", + " count = relay_counts_dict.get(relay, 0)\n", " ann.update(text=f\"{relay}
({count:,} blocks)\", font_size=9, yshift=8)\n", " \n", " # Add axis titles and ensure ticks are visible on all panels\n", @@ -873,7 +896,6 @@ }, { "cell_type": "markdown", - "id": "33", "metadata": {}, "source": [ "## Bid timing density\n", @@ -884,19 +906,24 @@ { "cell_type": "code", "execution_count": null, - "id": "34", "metadata": {}, "outputs": [], "source": [ "# Density contour with outlier markers\n", - "df_timing = df_mev_only.dropna(subset=[\"winning_bid_ms\", \"bid_to_block_ms\"]).copy()\n", - "df_timing = df_timing[(df_timing[\"bid_to_block_ms\"] > 0) & (df_timing[\"bid_to_block_ms\"] < 5000)]\n", + "df_timing = df_mev_only.filter(\n", + " pl.col(\"winning_bid_ms\").is_not_null() &\n", + " pl.col(\"bid_to_block_ms\").is_not_null() &\n", + " (pl.col(\"bid_to_block_ms\") > 0) &\n", + " (pl.col(\"bid_to_block_ms\") < 5000)\n", + ")\n", "\n", "if len(df_timing) > 0:\n", + " df_timing_pd = df_timing.to_pandas()\n", + " \n", " fig = go.Figure()\n", " \n", " # Density contour base\n", - " contour = px.density_contour(df_timing, x=\"winning_bid_ms\", y=\"bid_to_block_ms\")\n", + " contour = px.density_contour(df_timing_pd, x=\"winning_bid_ms\", y=\"bid_to_block_ms\")\n", " for trace in contour.data:\n", " trace.update(\n", " contours_coloring=\"fill\", \n", @@ -908,9 +935,11 @@ " fig.add_trace(trace)\n", " \n", " # Outliers (P95+ on either axis)\n", - " q95_x = df_timing[\"winning_bid_ms\"].quantile(0.95)\n", - " q95_y = df_timing[\"bid_to_block_ms\"].quantile(0.95)\n", - " outliers = df_timing[(df_timing[\"winning_bid_ms\"] > q95_x) | (df_timing[\"bid_to_block_ms\"] > q95_y)]\n", + " q95_x = df_timing.select(pl.col(\"winning_bid_ms\").quantile(0.95)).item()\n", + " q95_y = df_timing.select(pl.col(\"bid_to_block_ms\").quantile(0.95)).item()\n", + " outliers = df_timing.filter(\n", + " (pl.col(\"winning_bid_ms\") > q95_x) | (pl.col(\"bid_to_block_ms\") > q95_y)\n", + " ).to_pandas()\n", " \n", " fig.add_trace(go.Scatter(\n", " x=outliers[\"winning_bid_ms\"],\n", @@ -938,7 +967,6 @@ }, { "cell_type": "markdown", - "id": "35", "metadata": {}, "source": [ "## Bid timing by blob count\n", @@ -949,23 +977,28 @@ { "cell_type": "code", "execution_count": null, - "id": "36", "metadata": {}, "outputs": [], "source": [ "# Faceted heatmap by blob count (same bins as other charts)\n", - "df_timing = df_mev_only.dropna(subset=[\"winning_bid_ms\", \"bid_to_block_ms\"]).copy()\n", - "df_timing = df_timing[(df_timing[\"bid_to_block_ms\"] > 0) & (df_timing[\"bid_to_block_ms\"] < 5000)]\n", + "df_timing = df_mev_only.filter(\n", + " pl.col(\"winning_bid_ms\").is_not_null() &\n", + " pl.col(\"bid_to_block_ms\").is_not_null() &\n", + " (pl.col(\"bid_to_block_ms\") > 0) &\n", + " (pl.col(\"bid_to_block_ms\") < 5000)\n", + ")\n", "\n", "if len(df_timing) > 0:\n", + " df_timing_pd = df_timing.to_pandas()\n", + " \n", " n_rows = (len(BLOB_BIN_ORDER) + 2) // 3\n", " \n", " # Calculate axis ranges\n", - " x_max = df_timing[\"winning_bid_ms\"].quantile(0.99)\n", - " y_max = df_timing[\"bid_to_block_ms\"].quantile(0.99)\n", + " x_max = df_timing.select(pl.col(\"winning_bid_ms\").quantile(0.99)).item()\n", + " y_max = df_timing.select(pl.col(\"bid_to_block_ms\").quantile(0.99)).item()\n", "\n", " fig = px.density_heatmap(\n", - " df_timing,\n", + " df_timing_pd,\n", " x=\"winning_bid_ms\",\n", " y=\"bid_to_block_ms\",\n", " facet_col=\"blob_bin\",\n", @@ -1014,15 +1047,15 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": 
"python3" }, "language_info": { "name": "python", - "version": "3.11.0" + "version": "3.12.0" } }, "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 4 } diff --git a/notebooks/06-block-column-timing.ipynb b/notebooks/06-block-column-timing.ipynb index f3f4749..f20c9a7 100644 --- a/notebooks/06-block-column-timing.ipynb +++ b/notebooks/06-block-column-timing.ipynb @@ -19,7 +19,8 @@ }, "outputs": [], "source": [ - "import pandas as pd\n", + "import polars as pl\n", + "import pandas as pd # Required for plotly\n", "import numpy as np\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", @@ -50,24 +51,37 @@ "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"block_production_timeline\", target_date)\n", + "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n", "\n", "# Flag MEV vs local blocks\n", - "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n", - "df[\"block_type\"] = df[\"has_mev\"].map({True: \"MEV\", False: \"Local\"})\n", + "df = df.with_columns(\n", + " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\"),\n", + ")\n", + "df = df.with_columns(\n", + " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\"),\n", + ")\n", "\n", "# Filter to slots with blobs\n", - "df[\"has_blobs\"] = df[\"blob_count\"] > 0\n", - "df_blobs = df[df[\"has_blobs\"]].copy()\n", + "df = df.with_columns(\n", + " (pl.col(\"blob_count\") > 0).alias(\"has_blobs\"),\n", + ")\n", + "df_blobs = df.filter(pl.col(\"has_blobs\"))\n", "\n", "# Calculate block to first column delay\n", - "df_blobs = df_blobs.dropna(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n", - "df_blobs[\"block_to_column_ms\"] = df_blobs[\"first_column_first_seen_ms\"] - df_blobs[\"block_first_seen_ms\"]\n", + "df_blobs = df_blobs.drop_nulls(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n", + "df_blobs = df_blobs.with_columns(\n", + " (pl.col(\"first_column_first_seen_ms\") - pl.col(\"block_first_seen_ms\")).alias(\"block_to_column_ms\"),\n", + ")\n", "\n", - "print(f\"Total slots: {len(df):,}\")\n", - "print(f\"Slots with blobs: {len(df_blobs):,} ({len(df_blobs)/len(df)*100:.1f}%)\")\n", - "print(f\" MEV: {df_blobs['has_mev'].sum():,} ({df_blobs['has_mev'].mean()*100:.1f}%)\")\n", - "print(f\" Local: {(~df_blobs['has_mev']).sum():,} ({(~df_blobs['has_mev']).mean()*100:.1f}%)\")" + "total_slots = len(df)\n", + "blob_slots = len(df_blobs)\n", + "mev_count = df_blobs.filter(pl.col(\"has_mev\")).height\n", + "local_count = df_blobs.filter(~pl.col(\"has_mev\")).height\n", + "\n", + "print(f\"Total slots: {total_slots:,}\")\n", + "print(f\"Slots with blobs: {blob_slots:,} ({blob_slots/total_slots*100:.1f}%)\")\n", + "print(f\" MEV: {mev_count:,} ({mev_count/blob_slots*100:.1f}%)\")\n", + "print(f\" Local: {local_count:,} ({local_count/blob_slots*100:.1f}%)\")" ] }, { @@ -91,7 +105,7 @@ "source": [ "if len(df_blobs) > 0:\n", " fig = px.histogram(\n", - " df_blobs,\n", + " df_blobs.to_pandas(),\n", " x=\"block_to_column_ms\",\n", " color=\"block_type\",\n", " category_orders={\"block_type\": [\"MEV\", \"Local\"]},\n", @@ -121,13 +135,19 @@ "source": [ "# Summary statistics\n", "if len(df_blobs) > 0:\n", - " stats = df_blobs[\"block_to_column_ms\"].describe(percentiles=[0.5, 0.9, 0.95, 0.99])\n", + " block_to_column = df_blobs[\"block_to_column_ms\"]\n", + " median = block_to_column.median()\n", + " p90 = block_to_column.quantile(0.9)\n", + " p95 = block_to_column.quantile(0.95)\n", + " 
p99 = block_to_column.quantile(0.99)\n", + " max_val = block_to_column.max()\n", + " \n", " print(\"Block to first column (ms):\")\n", - " print(f\" Median: {stats['50%']:.0f}\")\n", - " print(f\" P90: {stats['90%']:.0f}\")\n", - " print(f\" P95: {stats['95%']:.0f}\")\n", - " print(f\" P99: {stats['99%']:.0f}\")\n", - " print(f\" Max: {stats['max']:.0f}\")" + " print(f\" Median: {median:.0f}\")\n", + " print(f\" P90: {p90:.0f}\")\n", + " print(f\" P95: {p95:.0f}\")\n", + " print(f\" P99: {p99:.0f}\")\n", + " print(f\" Max: {max_val:.0f}\")" ] }, { @@ -148,12 +168,13 @@ "outputs": [], "source": [ "if len(df_blobs) > 0:\n", - " df_plot = df_blobs.copy()\n", - " df_plot[\"blob_count_f\"] = df_plot[\"blob_count\"].astype(float) # Force continuous color\n", + " df_plot = df_blobs.with_columns(\n", + " pl.col(\"blob_count\").cast(pl.Float64).alias(\"blob_count_f\"),\n", + " )\n", " max_blobs = df_plot[\"blob_count\"].max()\n", " \n", " fig = px.scatter(\n", - " df_plot,\n", + " df_plot.to_pandas(),\n", " x=\"slot_start_date_time\",\n", " y=\"block_to_column_ms\",\n", " color=\"blob_count_f\",\n", @@ -192,10 +213,10 @@ "outputs": [], "source": [ "# Filter to slots with blobs (column_spread only exists for blob slots)\n", - "df_col_spread = df[df[\"blob_count\"] > 0].dropna(subset=[\"column_spread_ms\"])\n", + "df_col_spread = df.filter(pl.col(\"blob_count\") > 0).drop_nulls(subset=[\"column_spread_ms\"])\n", "if len(df_col_spread) > 0:\n", " fig = px.box(\n", - " df_col_spread,\n", + " df_col_spread.to_pandas(),\n", " x=\"blob_count\",\n", " y=\"column_spread_ms\",\n", " color=\"block_type\",\n", @@ -230,11 +251,13 @@ "metadata": {}, "outputs": [], "source": [ - "df_delay = df[df[\"blob_count\"] > 0].dropna(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n", - "df_delay[\"block_to_column_ms\"] = df_delay[\"first_column_first_seen_ms\"] - df_delay[\"block_first_seen_ms\"]\n", + "df_delay = df.filter(pl.col(\"blob_count\") > 0).drop_nulls(subset=[\"block_first_seen_ms\", \"first_column_first_seen_ms\"])\n", + "df_delay = df_delay.with_columns(\n", + " (pl.col(\"first_column_first_seen_ms\") - pl.col(\"block_first_seen_ms\")).alias(\"block_to_column_ms\"),\n", + ")\n", "if len(df_delay) > 0:\n", " fig = px.box(\n", - " df_delay,\n", + " df_delay.to_pandas(),\n", " x=\"blob_count\",\n", " y=\"block_to_column_ms\",\n", " color=\"block_type\",\n", diff --git a/notebooks/07-propagation-anomalies.ipynb b/notebooks/07-propagation-anomalies.ipynb index 243e59e..557de69 100644 --- a/notebooks/07-propagation-anomalies.ipynb +++ b/notebooks/07-propagation-anomalies.ipynb @@ -20,6 +20,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "import polars as pl\n", "import numpy as np\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", @@ -52,15 +53,19 @@ "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"block_production_timeline\", target_date)\n", + "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n", "\n", "# Filter to valid blocks (exclude missed slots)\n", - "df = df[df[\"block_first_seen_ms\"].notna()]\n", - "df = df[(df[\"block_first_seen_ms\"] >= 0) & (df[\"block_first_seen_ms\"] < 60000)]\n", + "df = df.filter(pl.col(\"block_first_seen_ms\").is_not_null())\n", + "df = df.filter((pl.col(\"block_first_seen_ms\") >= 0) & (pl.col(\"block_first_seen_ms\") < 60000))\n", "\n", "# Flag MEV vs local blocks\n", - "df[\"has_mev\"] = df[\"winning_bid_value\"].notna()\n", - "df[\"block_type\"] = df[\"has_mev\"].map({True: 
\"MEV\", False: \"Local\"})\n", + "df = df.with_columns([\n", + " pl.col(\"winning_bid_value\").is_not_null().alias(\"has_mev\"),\n", + "])\n", + "df = df.with_columns([\n", + " pl.when(pl.col(\"has_mev\")).then(pl.lit(\"MEV\")).otherwise(pl.lit(\"Local\")).alias(\"block_type\"),\n", + "])\n", "\n", "# Get max blob count for charts\n", "max_blobs = df[\"blob_count\"].max()\n", @@ -95,33 +100,43 @@ "outputs": [], "source": [ "# Conditional outliers: blocks slow relative to their blob count\n", - "df_anomaly = df.copy()\n", + "df_anomaly = df.clone()\n", "\n", - "# Fit regression: block_first_seen_ms ~ blob_count\n", - "slope, intercept, r_value, p_value, std_err = stats.linregress(\n", - " df_anomaly[\"blob_count\"].astype(float), df_anomaly[\"block_first_seen_ms\"]\n", - ")\n", + "# Fit regression: block_first_seen_ms ~ blob_count (need numpy arrays)\n", + "blob_count_arr = df_anomaly[\"blob_count\"].cast(pl.Float64).to_numpy()\n", + "block_ms_arr = df_anomaly[\"block_first_seen_ms\"].to_numpy()\n", + "\n", + "slope, intercept, r_value, p_value, std_err = stats.linregress(blob_count_arr, block_ms_arr)\n", "\n", "# Calculate expected value and residual\n", - "df_anomaly[\"expected_ms\"] = intercept + slope * df_anomaly[\"blob_count\"].astype(float)\n", - "df_anomaly[\"residual_ms\"] = df_anomaly[\"block_first_seen_ms\"] - df_anomaly[\"expected_ms\"]\n", + "df_anomaly = df_anomaly.with_columns([\n", + " (pl.lit(intercept) + pl.lit(slope) * pl.col(\"blob_count\").cast(pl.Float64)).alias(\"expected_ms\"),\n", + "])\n", + "df_anomaly = df_anomaly.with_columns([\n", + " (pl.col(\"block_first_seen_ms\") - pl.col(\"expected_ms\")).alias(\"residual_ms\"),\n", + "])\n", "\n", "# Calculate residual standard deviation\n", "residual_std = df_anomaly[\"residual_ms\"].std()\n", "\n", "# Flag anomalies: residual > 2σ (unexpectedly slow)\n", - "df_anomaly[\"is_anomaly\"] = df_anomaly[\"residual_ms\"] > 2 * residual_std\n", + "df_anomaly = df_anomaly.with_columns([\n", + " (pl.col(\"residual_ms\") > 2 * residual_std).alias(\"is_anomaly\"),\n", + "])\n", "\n", "n_anomalies = df_anomaly[\"is_anomaly\"].sum()\n", "pct_anomalies = n_anomalies / len(df_anomaly) * 100\n", "\n", "# Prepare outliers dataframe\n", - "df_outliers = df_anomaly[df_anomaly[\"is_anomaly\"]].copy()\n", - "df_outliers[\"relay\"] = df_outliers[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else \"Local\")\n", - "df_outliers[\"proposer\"] = df_outliers[\"proposer_entity\"].fillna(\"Unknown\")\n", - "df_outliers[\"builder\"] = df_outliers[\"winning_builder\"].apply(\n", - " lambda x: f\"{x[:10]}...\" if pd.notna(x) and x else \"Local\"\n", - ")\n", + "df_outliers = df_anomaly.filter(pl.col(\"is_anomaly\"))\n", + "df_outliers = df_outliers.with_columns([\n", + " pl.col(\"winning_relays\").list.get(0).fill_null(\"Local\").alias(\"relay\"),\n", + " pl.col(\"proposer_entity\").fill_null(\"Unknown\").alias(\"proposer\"),\n", + " pl.when(pl.col(\"winning_builder\").is_not_null() & (pl.col(\"winning_builder\") != \"\"))\n", + " .then(pl.col(\"winning_builder\").str.slice(0, 10) + pl.lit(\"...\"))\n", + " .otherwise(pl.lit(\"Local\"))\n", + " .alias(\"builder\"),\n", + "])\n", "\n", "print(f\"Regression: block_ms = {intercept:.1f} + {slope:.2f} × blob_count (R² = {r_value**2:.3f})\")\n", "print(f\"Residual σ = {residual_std:.1f}ms\")\n", @@ -164,13 +179,17 @@ "))\n", "\n", "# Normal points (sample to avoid overplotting)\n", - "df_normal = df_anomaly[~df_anomaly[\"is_anomaly\"]]\n", + "df_normal = df_anomaly.filter(~pl.col(\"is_anomaly\"))\n", 
"if len(df_normal) > 2000:\n", - " df_normal = df_normal.sample(2000, random_state=42)\n", + " df_normal = df_normal.sample(n=2000, seed=42)\n", + "\n", + "# Convert to pandas for plotly\n", + "df_normal_pd = df_normal.to_pandas()\n", + "df_outliers_pd = df_outliers.to_pandas()\n", "\n", "fig.add_trace(go.Scatter(\n", - " x=df_normal[\"blob_count\"],\n", - " y=df_normal[\"block_first_seen_ms\"],\n", + " x=df_normal_pd[\"blob_count\"],\n", + " y=df_normal_pd[\"block_first_seen_ms\"],\n", " mode=\"markers\",\n", " marker=dict(size=4, color=\"rgba(100,150,200,0.4)\"),\n", " name=f\"Normal ({len(df_anomaly) - n_anomalies:,})\",\n", @@ -179,8 +198,8 @@ "\n", "# Anomaly points\n", "fig.add_trace(go.Scatter(\n", - " x=df_outliers[\"blob_count\"],\n", - " y=df_outliers[\"block_first_seen_ms\"],\n", + " x=df_outliers_pd[\"blob_count\"],\n", + " y=df_outliers_pd[\"block_first_seen_ms\"],\n", " mode=\"markers\",\n", " marker=dict(\n", " size=7,\n", @@ -189,9 +208,9 @@ " ),\n", " name=f\"Anomalies ({n_anomalies:,})\",\n", " customdata=np.column_stack([\n", - " df_outliers[\"slot\"],\n", - " df_outliers[\"residual_ms\"].round(0),\n", - " df_outliers[\"relay\"],\n", + " df_outliers_pd[\"slot\"],\n", + " df_outliers_pd[\"residual_ms\"].round(0),\n", + " df_outliers_pd[\"relay\"],\n", " ]),\n", " hovertemplate=\"Slot %{customdata[0]}
Blobs: %{x}
Actual: %{y:.0f}ms
+%{customdata[1]}ms vs expected
Relay: %{customdata[2]}\",\n", "))\n", @@ -225,12 +244,17 @@ "source": [ "# All anomalies table with selectable text and Lab links\n", "if n_anomalies > 0:\n", - " df_table = df_outliers.sort_values(\"residual_ms\", ascending=False)[\n", - " [\"slot\", \"blob_count\", \"block_first_seen_ms\", \"expected_ms\", \"residual_ms\", \"proposer\", \"builder\", \"relay\"]\n", - " ].copy()\n", - " df_table[\"block_first_seen_ms\"] = df_table[\"block_first_seen_ms\"].round(0).astype(int)\n", - " df_table[\"expected_ms\"] = df_table[\"expected_ms\"].round(0).astype(int)\n", - " df_table[\"residual_ms\"] = df_table[\"residual_ms\"].round(0).astype(int)\n", + " df_table = df_outliers.sort(\"residual_ms\", descending=True).select([\n", + " \"slot\", \"blob_count\", \"block_first_seen_ms\", \"expected_ms\", \"residual_ms\", \"proposer\", \"builder\", \"relay\"\n", + " ])\n", + " df_table = df_table.with_columns([\n", + " pl.col(\"block_first_seen_ms\").round(0).cast(pl.Int64),\n", + " pl.col(\"expected_ms\").round(0).cast(pl.Int64),\n", + " pl.col(\"residual_ms\").round(0).cast(pl.Int64),\n", + " ])\n", + " \n", + " # Convert to pandas for iteration\n", + " df_table_pd = df_table.to_pandas()\n", " \n", " # Build HTML table\n", " html = '''\n", @@ -253,7 +277,7 @@ " \n", " '''\n", " \n", - " for _, row in df_table.iterrows():\n", + " for _, row in df_table_pd.iterrows():\n", " slot_link = f'{row[\"slot\"]}'\n", " html += f'''\n", " {slot_link}\n", @@ -292,28 +316,34 @@ "source": [ "if n_anomalies > 0:\n", " # Count anomalies by relay\n", - " relay_counts = df_outliers[\"relay\"].value_counts().reset_index()\n", - " relay_counts.columns = [\"relay\", \"anomaly_count\"]\n", + " relay_counts = df_outliers.group_by(\"relay\").agg(pl.len().alias(\"anomaly_count\"))\n", " \n", " # Get total blocks per relay for context\n", - " df_anomaly[\"relay\"] = df_anomaly[\"winning_relays\"].apply(lambda x: x[0] if len(x) > 0 else \"Local\")\n", - " total_by_relay = df_anomaly.groupby(\"relay\").size().reset_index(name=\"total_blocks\")\n", + " df_anomaly = df_anomaly.with_columns([\n", + " pl.col(\"winning_relays\").list.get(0).fill_null(\"Local\").alias(\"relay\"),\n", + " ])\n", + " total_by_relay = df_anomaly.group_by(\"relay\").agg(pl.len().alias(\"total_blocks\"))\n", " \n", - " relay_counts = relay_counts.merge(total_by_relay, on=\"relay\")\n", - " relay_counts[\"anomaly_rate\"] = relay_counts[\"anomaly_count\"] / relay_counts[\"total_blocks\"] * 100\n", - " relay_counts = relay_counts.sort_values(\"anomaly_count\", ascending=True)\n", + " relay_counts = relay_counts.join(total_by_relay, on=\"relay\")\n", + " relay_counts = relay_counts.with_columns([\n", + " (pl.col(\"anomaly_count\") / pl.col(\"total_blocks\") * 100).alias(\"anomaly_rate\"),\n", + " ])\n", + " relay_counts = relay_counts.sort(\"anomaly_count\")\n", + " \n", + " # Convert to pandas for plotly\n", + " relay_counts_pd = relay_counts.to_pandas()\n", " \n", " fig = go.Figure()\n", " \n", " fig.add_trace(go.Bar(\n", - " y=relay_counts[\"relay\"],\n", - " x=relay_counts[\"anomaly_count\"],\n", + " y=relay_counts_pd[\"relay\"],\n", + " x=relay_counts_pd[\"anomaly_count\"],\n", " orientation=\"h\",\n", " marker_color=\"#e74c3c\",\n", - " text=relay_counts.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", + " text=relay_counts_pd.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", " textposition=\"outside\",\n", " hovertemplate=\"%{y}
Anomalies: %{x}
Total blocks: %{customdata[0]:,}
Rate: %{customdata[1]:.1f}%\",\n", - " customdata=np.column_stack([relay_counts[\"total_blocks\"], relay_counts[\"anomaly_rate\"]]),\n", + " customdata=np.column_stack([relay_counts_pd[\"total_blocks\"], relay_counts_pd[\"anomaly_rate\"]]),\n", " ))\n", " \n", " fig.update_layout(\n", @@ -344,30 +374,36 @@ "source": [ "if n_anomalies > 0:\n", " # Count anomalies by proposer entity\n", - " proposer_counts = df_outliers[\"proposer\"].value_counts().reset_index()\n", - " proposer_counts.columns = [\"proposer\", \"anomaly_count\"]\n", + " proposer_counts = df_outliers.group_by(\"proposer\").agg(pl.len().alias(\"anomaly_count\"))\n", " \n", " # Get total blocks per proposer for context\n", - " df_anomaly[\"proposer\"] = df_anomaly[\"proposer_entity\"].fillna(\"Unknown\")\n", - " total_by_proposer = df_anomaly.groupby(\"proposer\").size().reset_index(name=\"total_blocks\")\n", + " df_anomaly = df_anomaly.with_columns([\n", + " pl.col(\"proposer_entity\").fill_null(\"Unknown\").alias(\"proposer\"),\n", + " ])\n", + " total_by_proposer = df_anomaly.group_by(\"proposer\").agg(pl.len().alias(\"total_blocks\"))\n", " \n", - " proposer_counts = proposer_counts.merge(total_by_proposer, on=\"proposer\")\n", - " proposer_counts[\"anomaly_rate\"] = proposer_counts[\"anomaly_count\"] / proposer_counts[\"total_blocks\"] * 100\n", + " proposer_counts = proposer_counts.join(total_by_proposer, on=\"proposer\")\n", + " proposer_counts = proposer_counts.with_columns([\n", + " (pl.col(\"anomaly_count\") / pl.col(\"total_blocks\") * 100).alias(\"anomaly_rate\"),\n", + " ])\n", " \n", " # Show top 15 by anomaly count\n", - " proposer_counts = proposer_counts.nlargest(15, \"anomaly_count\").sort_values(\"anomaly_count\", ascending=True)\n", + " proposer_counts = proposer_counts.sort(\"anomaly_count\", descending=True).head(15).sort(\"anomaly_count\")\n", + " \n", + " # Convert to pandas for plotly\n", + " proposer_counts_pd = proposer_counts.to_pandas()\n", " \n", " fig = go.Figure()\n", " \n", " fig.add_trace(go.Bar(\n", - " y=proposer_counts[\"proposer\"],\n", - " x=proposer_counts[\"anomaly_count\"],\n", + " y=proposer_counts_pd[\"proposer\"],\n", + " x=proposer_counts_pd[\"anomaly_count\"],\n", " orientation=\"h\",\n", " marker_color=\"#e74c3c\",\n", - " text=proposer_counts.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", + " text=proposer_counts_pd.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", " textposition=\"outside\",\n", " hovertemplate=\"%{y}
Anomalies: %{x}
Total blocks: %{customdata[0]:,}
Rate: %{customdata[1]:.1f}%\",\n", - " customdata=np.column_stack([proposer_counts[\"total_blocks\"], proposer_counts[\"anomaly_rate\"]]),\n", + " customdata=np.column_stack([proposer_counts_pd[\"total_blocks\"], proposer_counts_pd[\"anomaly_rate\"]]),\n", " ))\n", " \n", " fig.update_layout(\n", @@ -398,32 +434,39 @@ "source": [ "if n_anomalies > 0:\n", " # Count anomalies by builder\n", - " builder_counts = df_outliers[\"builder\"].value_counts().reset_index()\n", - " builder_counts.columns = [\"builder\", \"anomaly_count\"]\n", + " builder_counts = df_outliers.group_by(\"builder\").agg(pl.len().alias(\"anomaly_count\"))\n", " \n", " # Get total blocks per builder for context\n", - " df_anomaly[\"builder\"] = df_anomaly[\"winning_builder\"].apply(\n", - " lambda x: f\"{x[:10]}...\" if pd.notna(x) and x else \"Local\"\n", - " )\n", - " total_by_builder = df_anomaly.groupby(\"builder\").size().reset_index(name=\"total_blocks\")\n", + " df_anomaly = df_anomaly.with_columns([\n", + " pl.when(pl.col(\"winning_builder\").is_not_null() & (pl.col(\"winning_builder\") != \"\"))\n", + " .then(pl.col(\"winning_builder\").str.slice(0, 10) + pl.lit(\"...\"))\n", + " .otherwise(pl.lit(\"Local\"))\n", + " .alias(\"builder\"),\n", + " ])\n", + " total_by_builder = df_anomaly.group_by(\"builder\").agg(pl.len().alias(\"total_blocks\"))\n", " \n", - " builder_counts = builder_counts.merge(total_by_builder, on=\"builder\")\n", - " builder_counts[\"anomaly_rate\"] = builder_counts[\"anomaly_count\"] / builder_counts[\"total_blocks\"] * 100\n", + " builder_counts = builder_counts.join(total_by_builder, on=\"builder\")\n", + " builder_counts = builder_counts.with_columns([\n", + " (pl.col(\"anomaly_count\") / pl.col(\"total_blocks\") * 100).alias(\"anomaly_rate\"),\n", + " ])\n", " \n", " # Show top 15 by anomaly count\n", - " builder_counts = builder_counts.nlargest(15, \"anomaly_count\").sort_values(\"anomaly_count\", ascending=True)\n", + " builder_counts = builder_counts.sort(\"anomaly_count\", descending=True).head(15).sort(\"anomaly_count\")\n", + " \n", + " # Convert to pandas for plotly\n", + " builder_counts_pd = builder_counts.to_pandas()\n", " \n", " fig = go.Figure()\n", " \n", " fig.add_trace(go.Bar(\n", - " y=builder_counts[\"builder\"],\n", - " x=builder_counts[\"anomaly_count\"],\n", + " y=builder_counts_pd[\"builder\"],\n", + " x=builder_counts_pd[\"anomaly_count\"],\n", " orientation=\"h\",\n", " marker_color=\"#e74c3c\",\n", - " text=builder_counts.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", + " text=builder_counts_pd.apply(lambda r: f\"{r['anomaly_count']} ({r['anomaly_rate']:.1f}%)\", axis=1),\n", " textposition=\"outside\",\n", " hovertemplate=\"%{y}
Anomalies: %{x}
Total blocks: %{customdata[0]:,}
Rate: %{customdata[1]:.1f}%\",\n", - " customdata=np.column_stack([builder_counts[\"total_blocks\"], builder_counts[\"anomaly_rate\"]]),\n", + " customdata=np.column_stack([builder_counts_pd[\"total_blocks\"], builder_counts_pd[\"anomaly_rate\"]]),\n", " ))\n", " \n", " fig.update_layout(\n", @@ -454,21 +497,26 @@ "source": [ "if n_anomalies > 0:\n", " # Count anomalies by blob count\n", - " blob_anomalies = df_outliers.groupby(\"blob_count\").size().reset_index(name=\"anomaly_count\")\n", - " blob_total = df_anomaly.groupby(\"blob_count\").size().reset_index(name=\"total_blocks\")\n", + " blob_anomalies = df_outliers.group_by(\"blob_count\").agg(pl.len().alias(\"anomaly_count\"))\n", + " blob_total = df_anomaly.group_by(\"blob_count\").agg(pl.len().alias(\"total_blocks\"))\n", + " \n", + " blob_stats = blob_total.join(blob_anomalies, on=\"blob_count\", how=\"left\").fill_null(0)\n", + " blob_stats = blob_stats.with_columns([\n", + " pl.col(\"anomaly_count\").cast(pl.Int64),\n", + " (pl.col(\"anomaly_count\") / pl.col(\"total_blocks\") * 100).alias(\"anomaly_rate\"),\n", + " ])\n", " \n", - " blob_stats = blob_total.merge(blob_anomalies, on=\"blob_count\", how=\"left\").fillna(0)\n", - " blob_stats[\"anomaly_count\"] = blob_stats[\"anomaly_count\"].astype(int)\n", - " blob_stats[\"anomaly_rate\"] = blob_stats[\"anomaly_count\"] / blob_stats[\"total_blocks\"] * 100\n", + " # Convert to pandas for plotly\n", + " blob_stats_pd = blob_stats.to_pandas()\n", " \n", " fig = go.Figure()\n", " \n", " fig.add_trace(go.Bar(\n", - " x=blob_stats[\"blob_count\"],\n", - " y=blob_stats[\"anomaly_count\"],\n", + " x=blob_stats_pd[\"blob_count\"],\n", + " y=blob_stats_pd[\"anomaly_count\"],\n", " marker_color=\"#e74c3c\",\n", " hovertemplate=\"%{x} blobs
Anomalies: %{y}
Total: %{customdata[0]:,}
Rate: %{customdata[1]:.1f}%\",\n", - " customdata=np.column_stack([blob_stats[\"total_blocks\"], blob_stats[\"anomaly_rate\"]]),\n", + " customdata=np.column_stack([blob_stats_pd[\"total_blocks\"], blob_stats_pd[\"anomaly_rate\"]]),\n", " ))\n", " \n", " fig.update_layout(\n", diff --git a/notebooks/08-missed-slots.ipynb b/notebooks/08-missed-slots.ipynb index a2a69d8..1cc85ac 100644 --- a/notebooks/08-missed-slots.ipynb +++ b/notebooks/08-missed-slots.ipynb @@ -23,6 +23,7 @@ }, "outputs": [], "source": [ + "import polars as pl\n", "import pandas as pd\n", "import numpy as np\n", "import plotly.express as px\n", @@ -55,12 +56,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"block_production_timeline\", target_date)\n", + "df = pl.from_pandas(load_parquet(\"block_production_timeline\", target_date))\n", "\n", "# Identify missed slots: ClickHouse LEFT JOIN returns epoch date (1970-01-01) instead of NULL\n", "# for non-matching rows, so we detect missed slots by checking for the epoch timestamp\n", - "epoch = pd.Timestamp(\"1970-01-01\")\n", - "df[\"is_missed\"] = df[\"block_first_seen\"] == epoch\n", + "epoch = pl.datetime(1970, 1, 1)\n", + "df = df.with_columns(\n", + " (pl.col(\"block_first_seen\") == epoch).alias(\"is_missed\")\n", + ")\n", "\n", "total_slots = len(df)\n", "missed_slots = df[\"is_missed\"].sum()\n", @@ -90,29 +93,37 @@ "outputs": [], "source": [ "# Missed slots by entity\n", - "df_missed = df[df[\"is_missed\"]].copy()\n", + "df_missed = df.filter(pl.col(\"is_missed\"))\n", "\n", "if len(df_missed) > 0:\n", " # Fill empty entities\n", - " df_missed[\"proposer_entity\"] = df_missed[\"proposer_entity\"].fillna(\"unknown\").replace(\"\", \"unknown\")\n", + " df_missed = df_missed.with_columns(\n", + " pl.col(\"proposer_entity\").fill_null(\"unknown\").replace(\"\", \"unknown\").alias(\"proposer_entity\")\n", + " )\n", + " \n", + " entity_misses = (\n", + " df_missed.group_by(\"proposer_entity\")\n", + " .agg(pl.len().alias(\"missed_count\"))\n", + " .sort(\"missed_count\")\n", + " )\n", " \n", - " entity_misses = df_missed.groupby(\"proposer_entity\").size().reset_index(name=\"missed_count\")\n", - " entity_misses = entity_misses.sort_values(\"missed_count\", ascending=True)\n", + " # Convert to pandas for plotting\n", + " entity_misses_pd = entity_misses.to_pandas()\n", " \n", " fig = go.Figure()\n", " fig.add_trace(go.Bar(\n", - " y=entity_misses[\"proposer_entity\"],\n", - " x=entity_misses[\"missed_count\"],\n", + " y=entity_misses_pd[\"proposer_entity\"],\n", + " x=entity_misses_pd[\"missed_count\"],\n", " orientation=\"h\",\n", " marker_color=\"#e74c3c\",\n", - " text=entity_misses[\"missed_count\"],\n", + " text=entity_misses_pd[\"missed_count\"],\n", " textposition=\"outside\",\n", " ))\n", " fig.update_layout(\n", " margin=dict(l=150, r=50, t=30, b=60),\n", " xaxis=dict(title=\"Missed slots\"),\n", " yaxis=dict(title=\"\"),\n", - " height=max(300, len(entity_misses) * 25 + 100),\n", + " height=max(300, len(entity_misses_pd) * 25 + 100),\n", " )\n", " fig.show(config={\"responsive\": True})\n", "else:\n", @@ -138,35 +149,52 @@ "source": [ "if len(df_missed) > 0:\n", " # Calculate miss rate per entity\n", - " df[\"proposer_entity_clean\"] = df[\"proposer_entity\"].fillna(\"unknown\").replace(\"\", \"unknown\")\n", + " df_with_entity = df.with_columns(\n", + " pl.col(\"proposer_entity\").fill_null(\"unknown\").replace(\"\", \"unknown\").alias(\"proposer_entity_clean\")\n", + " )\n", " \n", - " entity_stats = 
df.groupby(\"proposer_entity_clean\").agg(\n", - " total_slots=(\"slot\", \"count\"),\n", - " missed_slots=(\"is_missed\", \"sum\")\n", - " ).reset_index()\n", - " entity_stats[\"miss_rate\"] = entity_stats[\"missed_slots\"] / entity_stats[\"total_slots\"] * 100\n", + " entity_stats = (\n", + " df_with_entity.group_by(\"proposer_entity_clean\")\n", + " .agg(\n", + " pl.len().alias(\"total_slots\"),\n", + " pl.col(\"is_missed\").sum().alias(\"missed_slots\")\n", + " )\n", + " .with_columns(\n", + " (pl.col(\"missed_slots\") / pl.col(\"total_slots\") * 100).alias(\"miss_rate\")\n", + " )\n", + " )\n", " \n", " # Only show entities with at least 1 missed slot\n", - " entity_stats = entity_stats[entity_stats[\"missed_slots\"] > 0]\n", - " entity_stats = entity_stats.sort_values(\"miss_rate\", ascending=True)\n", + " entity_stats = (\n", + " entity_stats.filter(pl.col(\"missed_slots\") > 0)\n", + " .sort(\"miss_rate\")\n", + " )\n", + " \n", + " # Convert to pandas for plotting\n", + " entity_stats_pd = entity_stats.to_pandas()\n", + " \n", + " # Create text labels\n", + " entity_stats_pd[\"label\"] = entity_stats_pd.apply(\n", + " lambda r: f\"{r['miss_rate']:.1f}% ({int(r['missed_slots'])}/{int(r['total_slots'])})\", axis=1\n", + " )\n", " \n", " # Color by miss rate\n", " fig = go.Figure()\n", " fig.add_trace(go.Bar(\n", - " y=entity_stats[\"proposer_entity_clean\"],\n", - " x=entity_stats[\"miss_rate\"],\n", + " y=entity_stats_pd[\"proposer_entity_clean\"],\n", + " x=entity_stats_pd[\"miss_rate\"],\n", " orientation=\"h\",\n", - " marker_color=entity_stats[\"miss_rate\"],\n", + " marker_color=entity_stats_pd[\"miss_rate\"],\n", " marker_colorscale=\"YlOrRd\",\n", - " text=entity_stats.apply(lambda r: f\"{r['miss_rate']:.1f}% ({int(r['missed_slots'])}/{int(r['total_slots'])})\", axis=1),\n", + " text=entity_stats_pd[\"label\"],\n", " textposition=\"outside\",\n", " hovertemplate=\"%{y}
Miss rate: %{x:.2f}%\",\n", " ))\n", " fig.update_layout(\n", " margin=dict(l=150, r=100, t=30, b=60),\n", - " xaxis=dict(title=\"Miss rate (%)\", range=[0, max(entity_stats[\"miss_rate\"]) * 1.3]),\n", + " xaxis=dict(title=\"Miss rate (%)\", range=[0, max(entity_stats_pd[\"miss_rate\"]) * 1.3]),\n", " yaxis=dict(title=\"\"),\n", - " height=max(300, len(entity_stats) * 25 + 100),\n", + " height=max(300, len(entity_stats_pd) * 25 + 100),\n", " )\n", " fig.show(config={\"responsive\": True})" ] @@ -190,19 +218,30 @@ "source": [ "if len(df_missed) > 0:\n", " # Extract hour from slot time\n", - " df_missed[\"hour\"] = pd.to_datetime(df_missed[\"slot_start_date_time\"]).dt.hour\n", + " df_missed_hourly = df_missed.with_columns(\n", + " pl.col(\"slot_start_date_time\").dt.hour().alias(\"hour\")\n", + " )\n", " \n", - " hourly_misses = df_missed.groupby(\"hour\").size().reset_index(name=\"missed_count\")\n", + " hourly_misses = (\n", + " df_missed_hourly.group_by(\"hour\")\n", + " .agg(pl.len().alias(\"missed_count\"))\n", + " )\n", " \n", " # Fill in missing hours with 0\n", - " all_hours = pd.DataFrame({\"hour\": range(24)})\n", - " hourly_misses = all_hours.merge(hourly_misses, on=\"hour\", how=\"left\").fillna(0)\n", - " hourly_misses[\"missed_count\"] = hourly_misses[\"missed_count\"].astype(int)\n", + " all_hours = pl.DataFrame({\"hour\": range(24)})\n", + " hourly_misses = (\n", + " all_hours.join(hourly_misses, on=\"hour\", how=\"left\")\n", + " .fill_null(0)\n", + " .sort(\"hour\")\n", + " )\n", + " \n", + " # Convert to pandas for plotting\n", + " hourly_misses_pd = hourly_misses.to_pandas()\n", " \n", " fig = go.Figure()\n", " fig.add_trace(go.Bar(\n", - " x=hourly_misses[\"hour\"],\n", - " y=hourly_misses[\"missed_count\"],\n", + " x=hourly_misses_pd[\"hour\"],\n", + " y=hourly_misses_pd[\"missed_count\"],\n", " marker_color=\"#e74c3c\",\n", " ))\n", " fig.update_layout(\n", @@ -232,7 +271,8 @@ "outputs": [], "source": [ "if len(df_missed) > 0:\n", - " df_plot = df_missed.copy()\n", + " # Convert to pandas for plotting (plotly needs pandas for datetime handling)\n", + " df_plot = df_missed.select([\"slot\", \"slot_start_date_time\", \"proposer_entity\"]).to_pandas()\n", " df_plot[\"time\"] = pd.to_datetime(df_plot[\"slot_start_date_time\"])\n", " \n", " fig = go.Figure()\n", @@ -271,14 +311,23 @@ "outputs": [], "source": [ "if len(df_missed) > 0:\n", - " df_table = df_missed[[\"slot\", \"slot_start_date_time\", \"proposer_entity\"]].copy()\n", - " df_table[\"proposer_entity\"] = df_table[\"proposer_entity\"].fillna(\"unknown\").replace(\"\", \"unknown\")\n", - " df_table[\"time\"] = pd.to_datetime(df_table[\"slot_start_date_time\"]).dt.strftime(\"%H:%M:%S\")\n", - " df_table = df_table.sort_values(\"slot\")\n", + " df_table = (\n", + " df_missed.select([\"slot\", \"slot_start_date_time\", \"proposer_entity\"])\n", + " .with_columns(\n", + " pl.col(\"proposer_entity\").fill_null(\"unknown\").replace(\"\", \"unknown\")\n", + " )\n", + " .with_columns(\n", + " pl.col(\"slot_start_date_time\").dt.strftime(\"%H:%M:%S\").alias(\"time\")\n", + " )\n", + " .sort(\"slot\")\n", + " )\n", " \n", " # Create Lab links\n", - " df_table[\"lab_link\"] = df_table[\"slot\"].apply(\n", - " lambda s: f'View'\n", + " df_table = df_table.with_columns(\n", + " pl.col(\"slot\").map_elements(\n", + " lambda s: f'View',\n", + " return_dtype=pl.Utf8\n", + " ).alias(\"lab_link\")\n", " )\n", " \n", " # Build HTML table\n", @@ -300,7 +349,7 @@ " \n", " '''\n", " \n", - " for _, row in df_table.iterrows():\n", + " 
for row in df_table.iter_rows(named=True):\n", " html += f'''\n", " {row[\"slot\"]}\n", " {row[\"time\"]}\n", diff --git a/notebooks/09-network-overview.ipynb b/notebooks/09-network-overview.ipynb index 5955cf5..9d249e0 100644 --- a/notebooks/09-network-overview.ipynb +++ b/notebooks/09-network-overview.ipynb @@ -19,7 +19,7 @@ }, "outputs": [], "source": [ - "import pandas as pd\n", + "import polars as pl\n", "import plotly.express as px\n", "\n", "from loaders import load_parquet, display_sql\n", @@ -48,7 +48,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = load_parquet(\"xatu_client_connectivity\", target_date)" + "df = pl.from_pandas(load_parquet(\"xatu_client_connectivity\", target_date))" ] }, { @@ -68,11 +68,15 @@ "metadata": {}, "outputs": [], "source": [ - "df_unique = df.groupby(\"hour_bucket\")[\"peer_id\"].nunique().reset_index()\n", - "df_unique.columns = [\"hour_bucket\", \"unique_peers\"]\n", + "df_unique = (\n", + " df\n", + " .group_by(\"hour_bucket\")\n", + " .agg(unique_peers=pl.col(\"peer_id\").n_unique())\n", + " .sort(\"hour_bucket\")\n", + ")\n", "\n", "fig = px.line(\n", - " df_unique,\n", + " df_unique.to_pandas(),\n", " x=\"hour_bucket\",\n", " y=\"unique_peers\",\n", ")\n", @@ -101,15 +105,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Filter out empty client names and deduplicate per hour/peer\n", - "df_clients = df[df[\"client_name\"].notna() & (df[\"client_name\"] != \"\")].copy()\n", - "df_clients = df_clients.sort_values([\"hour_bucket\", \"peer_id\", \"client_name\"], ascending=[True, True, False])\n", - "df_clients = df_clients.drop_duplicates(subset=[\"hour_bucket\", \"peer_id\"], keep=\"first\")\n", - "\n", - "df_client_counts = df_clients.groupby([\"hour_bucket\", \"client_name\"]).size().reset_index(name=\"peers\")\n", + "df_clients = (\n", + " df\n", + " .filter(pl.col(\"client_name\").is_not_null() & (pl.col(\"client_name\") != \"\"))\n", + " .sort([\"hour_bucket\", \"peer_id\", \"client_name\"], descending=[False, False, True])\n", + " .unique(subset=[\"hour_bucket\", \"peer_id\"], keep=\"first\")\n", + " .group_by([\"hour_bucket\", \"client_name\"])\n", + " .agg(peers=pl.len())\n", + " .sort([\"hour_bucket\", \"peers\"])\n", + ")\n", "\n", "fig = px.area(\n", - " df_client_counts,\n", + " df_clients.to_pandas(),\n", " x=\"hour_bucket\",\n", " y=\"peers\",\n", " color=\"client_name\",\n", @@ -140,12 +147,18 @@ "metadata": {}, "outputs": [], "source": [ - "df_xatu = df.groupby([\"hour_bucket\", \"local_name\"])[\"peer_id\"].nunique().reset_index()\n", - "df_xatu.columns = [\"hour_bucket\", \"local_name\", \"peers\"]\n", - "df_xatu[\"local_name\"] = df_xatu[\"local_name\"].str.replace(\"ethpandaops/mainnet/\", \"\", regex=False)\n", + "df_xatu = (\n", + " df\n", + " .group_by([\"hour_bucket\", \"local_name\"])\n", + " .agg(peers=pl.col(\"peer_id\").n_unique())\n", + " .sort(\"hour_bucket\")\n", + " .with_columns(\n", + " pl.col(\"local_name\").str.replace(\"ethpandaops/mainnet/\", \"\")\n", + " )\n", + ")\n", "\n", "fig = px.line(\n", - " df_xatu,\n", + " df_xatu.to_pandas(),\n", " x=\"hour_bucket\",\n", " y=\"peers\",\n", " color=\"local_name\",\n", @@ -183,18 +196,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Group transports per peer/hour/protocol\n", "df_transport = (\n", - " df.groupby([\"hour_bucket\", \"peer_id\", \"protocol\"])[\"transport_protocol\"]\n", - " .apply(lambda x: \" & \".join(sorted(x.dropna().unique())))\n", - " .reset_index()\n", + " df\n", + " .group_by([\"hour_bucket\", \"peer_id\", \"protocol\"])\n", + " 
.agg(\n", + " all_transports=pl.col(\"transport_protocol\").unique().sort().str.join(\" & \")\n", + " )\n", + " .with_columns(\n", + " protocol_combos=pl.col(\"protocol\") + \" + (\" + pl.col(\"all_transports\") + \")\"\n", + " )\n", + " .group_by([\"hour_bucket\", \"protocol_combos\"])\n", + " .agg(peers=pl.len())\n", + " .sort(\"hour_bucket\")\n", ")\n", - "df_transport[\"protocol_combos\"] = df_transport[\"protocol\"] + \" + (\" + df_transport[\"transport_protocol\"] + \")\"\n", - "\n", - "df_proto_counts = df_transport.groupby([\"hour_bucket\", \"protocol_combos\"]).size().reset_index(name=\"peers\")\n", "\n", "fig = px.line(\n", - " df_proto_counts,\n", + " df_transport.to_pandas(),\n", " x=\"hour_bucket\",\n", " y=\"peers\",\n", " color=\"protocol_combos\",\n", @@ -225,13 +242,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Count unique peers per port\n", - "df_ports = df.drop_duplicates(subset=[\"peer_id\", \"port\"]).groupby(\"port\").size().reset_index(name=\"peers\")\n", - "df_ports = df_ports.sort_values(\"peers\", ascending=False).head(20)\n", - "df_ports[\"port\"] = df_ports[\"port\"].astype(str)\n", + "df_ports = (\n", + " df\n", + " .unique(subset=[\"peer_id\", \"port\"])\n", + " .group_by(\"port\")\n", + " .agg(peers=pl.len())\n", + " .with_columns(pl.col(\"port\").cast(pl.Utf8))\n", + " .sort(\"peers\", descending=True)\n", + " .head(20)\n", + ")\n", "\n", "fig = px.bar(\n", - " df_ports,\n", + " df_ports.to_pandas(),\n", " x=\"port\",\n", " y=\"peers\",\n", ")\n", diff --git a/pyproject.toml b/pyproject.toml index 628256f..8ecd79d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ requires-python = ">=3.13" dependencies = [ "clickhouse-connect>=0.8.0", "pandas>=2.0", + "polars>=1.0", "plotly>=5.0", "altair>=5.0", "numpy>=1.26", diff --git a/uv.lock b/uv.lock index e01f550..988b8b4 100644 --- a/uv.lock +++ b/uv.lock @@ -1213,6 +1213,7 @@ dependencies = [ { name = "pandas" }, { name = "papermill" }, { name = "plotly" }, + { name = "polars" }, { name = "pyarrow" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -1239,6 +1240,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.0" }, { name = "papermill", specifier = ">=2.6.0" }, { name = "plotly", specifier = ">=5.0" }, + { name = "polars", specifier = ">=1.0" }, { name = "pyarrow", specifier = ">=22.0.0" }, { name = "python-dotenv", specifier = ">=1.0" }, { name = "pyyaml", specifier = ">=6.0.3" }, @@ -1288,6 +1290,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/c3/3031c931098de393393e1f93a38dc9ed6805d86bb801acc3cf2d5bd1e6b7/plotly-6.5.0-py3-none-any.whl", hash = "sha256:5ac851e100367735250206788a2b1325412aa4a4917a4fe3e6f0bc5aa6f3d90a", size = 9893174, upload-time = "2025-11-17T18:39:20.351Z" }, ] +[[package]] +name = "polars" +version = "1.37.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "polars-runtime-32" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/84/ae/dfebf31b9988c20998140b54d5b521f64ce08879f2c13d9b4d44d7c87e32/polars-1.37.1.tar.gz", hash = "sha256:0309e2a4633e712513401964b4d95452f124ceabf7aec6db50affb9ced4a274e", size = 715572, upload-time = "2026-01-12T23:27:03.267Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/75/ec73e38812bca7c2240aff481b9ddff20d1ad2f10dee4b3353f5eeaacdab/polars-1.37.1-py3-none-any.whl", hash = "sha256:377fed8939a2f1223c1563cfabdc7b4a3d6ff846efa1f2ddeb8644fafd9b1aff", size = 805749, upload-time = "2026-01-12T23:25:48.595Z" }, +] + +[[package]] +name = 
"polars-runtime-32" +version = "1.37.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/40/0b/addabe5e8d28a5a4c9887a08907be7ddc3fce892dc38f37d14b055438a57/polars_runtime_32-1.37.1.tar.gz", hash = "sha256:68779d4a691da20a5eb767d74165a8f80a2bdfbde4b54acf59af43f7fa028d8f", size = 2818945, upload-time = "2026-01-12T23:27:04.653Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/a2/e828ea9f845796de02d923edb790e408ca0b560cd68dbd74bb99a1b3c461/polars_runtime_32-1.37.1-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0b8d4d73ea9977d3731927740e59d814647c5198bdbe359bcf6a8bfce2e79771", size = 43499912, upload-time = "2026-01-12T23:25:51.182Z" }, + { url = "https://files.pythonhosted.org/packages/7e/46/81b71b7aa9e3703ee6e4ef1f69a87e40f58ea7c99212bf49a95071e99c8c/polars_runtime_32-1.37.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c682bf83f5f352e5e02f5c16c652c48ca40442f07b236f30662b22217320ce76", size = 39695707, upload-time = "2026-01-12T23:25:54.289Z" }, + { url = "https://files.pythonhosted.org/packages/81/2e/20009d1fde7ee919e24040f5c87cb9d0e4f8e3f109b74ba06bc10c02459c/polars_runtime_32-1.37.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc82b5bbe70ca1a4b764eed1419f6336752d6ba9fc1245388d7f8b12438afa2c", size = 41467034, upload-time = "2026-01-12T23:25:56.925Z" }, + { url = "https://files.pythonhosted.org/packages/eb/21/9b55bea940524324625b1e8fd96233290303eb1bf2c23b54573487bbbc25/polars_runtime_32-1.37.1-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8362d11ac5193b994c7e9048ffe22ccfb976699cfbf6e128ce0302e06728894", size = 45142711, upload-time = "2026-01-12T23:26:00.817Z" }, + { url = "https://files.pythonhosted.org/packages/8c/25/c5f64461aeccdac6834a89f826d051ccd3b4ce204075e562c87a06ed2619/polars_runtime_32-1.37.1-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:04f5d5a2f013dca7391b7d8e7672fa6d37573a87f1d45d3dd5f0d9b5565a4b0f", size = 41638564, upload-time = "2026-01-12T23:26:04.186Z" }, + { url = "https://files.pythonhosted.org/packages/35/af/509d3cf6c45e764ccf856beaae26fc34352f16f10f94a7839b1042920a73/polars_runtime_32-1.37.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fbfde7c0ca8209eeaed546e4a32cca1319189aa61c5f0f9a2b4494262bd0c689", size = 44721136, upload-time = "2026-01-12T23:26:07.088Z" }, + { url = "https://files.pythonhosted.org/packages/af/d1/5c0a83a625f72beef59394bebc57d12637997632a4f9d3ab2ffc2cc62bbf/polars_runtime_32-1.37.1-cp310-abi3-win_amd64.whl", hash = "sha256:da3d3642ae944e18dd17109d2a3036cb94ce50e5495c5023c77b1599d4c861bc", size = 44948288, upload-time = "2026-01-12T23:26:10.214Z" }, + { url = "https://files.pythonhosted.org/packages/10/f3/061bb702465904b6502f7c9081daee34b09ccbaa4f8c94cf43a2a3b6dd6f/polars_runtime_32-1.37.1-cp310-abi3-win_arm64.whl", hash = "sha256:55f2c4847a8d2e267612f564de7b753a4bde3902eaabe7b436a0a4abf75949a0", size = 41001914, upload-time = "2026-01-12T23:26:12.997Z" }, +] + [[package]] name = "prometheus-client" version = "0.23.1"