diff --git a/notebooks/04-mempool-visibility.ipynb b/notebooks/04-mempool-visibility.ipynb index 22acdd8..7d3dd99 100644 --- a/notebooks/04-mempool-visibility.ipynb +++ b/notebooks/04-mempool-visibility.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Analysis of transaction visibility in the public mempool before block inclusion on Ethereum mainnet." + "Analysis of transaction visibility in the public mempool before block inclusion on Ethereum mainnet.\n", + "\n", + "**Methodology:** A transaction is counted as \"seen in mempool\" only if it was observed by our sentries *before* the slot start time of the block that included it. This corrects for transactions that appear in the mempool after block propagation." ] }, { @@ -16,7 +18,41 @@ ] }, "outputs": [], - "source": "import pandas as pd\nimport plotly.express as px\nimport plotly.graph_objects as go\n\nfrom loaders import load_parquet, display_sql\n\n# Transaction type labels\nTX_TYPE_LABELS = {\n 0: \"Legacy\",\n 1: \"Access list\",\n 2: \"EIP-1559\",\n 3: \"Blob\",\n 4: \"EIP-7702\",\n}\n\nTX_TYPE_COLORS = {\n 0: \"#636EFA\",\n 1: \"#EF553B\",\n 2: \"#00CC96\",\n 3: \"#AB63FA\",\n 4: \"#FFA15A\",\n}\n\ntarget_date = None # Set via papermill, or auto-detect from manifest" + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "from plotly.subplots import make_subplots\n", + "\n", + "from loaders import load_parquet, display_sql\n", + "\n", + "# Transaction type labels and colors\n", + "TX_TYPE_LABELS = {\n", + " 0: \"Legacy\",\n", + " 1: \"Access list\",\n", + " 2: \"EIP-1559\",\n", + " 3: \"Blob\",\n", + " 4: \"EIP-7702\",\n", + "}\n", + "\n", + "TX_TYPE_COLORS = {\n", + " 0: \"#636EFA\",\n", + " 1: \"#EF553B\",\n", + " 2: \"#00CC96\",\n", + " 3: \"#AB63FA\",\n", + " 4: \"#FFA15A\",\n", + "}\n", + "\n", + "# Histogram bucket labels (log2 seconds, up to 1 hour)\n", + "HIST_LABELS = [\n", + " \"<0.5s\", \"0.5-1s\", \"1-2s\", \"2-4s\", \"4-8s\", \"8-16s\",\n", + " \"16-32s\", \"32s-1m\", \"1-2m\", \"2-4m\", \"4-8m\", \"8-17m\",\n", + " \"17-34m\", \"34-60m\", \">=1h\"\n", + "]\n", + "\n", + "target_date = None # Set via papermill, or auto-detect from manifest" + ] }, { "cell_type": "code", @@ -27,14 +63,42 @@ ] }, "outputs": [], - "source": "display_sql(\"mempool_coverage\", target_date)" + "source": [ + "display_sql(\"mempool_availability\", target_date)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "df = load_parquet(\"mempool_coverage\", target_date)\ndf[\"tx_type_label\"] = df[\"tx_type\"].map(TX_TYPE_LABELS)\ndf[\"coverage_pct\"] = df[\"seen_in_mempool\"] / df[\"total_txs\"] * 100\n\nprint(f\"Loaded {len(df):,} hour/type rows\")\nprint(f\"Hours: {df['hour'].nunique():,}\")\nprint(f\"Total transactions: {df['total_txs'].sum():,}\")" + "source": [ + "df = load_parquet(\"mempool_availability\", target_date)\n", + "df[\"tx_type_label\"] = df[\"tx_type\"].map(TX_TYPE_LABELS)\n", + "df[\"coverage_pct\"] = df[\"seen_before_slot\"] / df[\"total_txs\"] * 100\n", + "\n", + "# Calculate never seen (truly private)\n", + "df[\"never_seen\"] = df[\"total_txs\"] - df[\"seen_before_slot\"] - df[\"seen_after_slot\"]\n", + "\n", + "# Extract p50 age from percentiles array (index 0)\n", + "df[\"p50_age_ms\"] = df[\"age_percentiles_ms\"].apply(lambda x: x[0] if x is not None and len(x) > 0 else np.nan)\n", + "df[\"p50_age_s\"] = df[\"p50_age_ms\"] / 1000\n", + "\n", + "# Add hour 
column for time-series aggregation\n", + "df[\"hour\"] = df[\"slot_start_date_time\"].dt.floor(\"h\")\n", + "\n", + "total = df[\"total_txs\"].sum()\n", + "before = df[\"seen_before_slot\"].sum()\n", + "after = df[\"seen_after_slot\"].sum()\n", + "never = total - before - after\n", + "\n", + "print(f\"Loaded {len(df):,} slot/type rows\")\n", + "print(f\"Slots: {df['slot'].nunique():,}\")\n", + "print(f\"Total transactions: {total:,}\")\n", + "print(f\" Seen before slot: {before:,} ({100*before/total:.1f}%)\")\n", + "print(f\" Seen after slot: {after:,} ({100*after/total:.1f}%)\")\n", + "print(f\" Never seen: {never:,} ({100*never/total:.1f}%)\")" + ] }, { "cell_type": "markdown", @@ -42,7 +106,7 @@ "source": [ "## Coverage by transaction type\n", "\n", - "Summary of how many transactions were seen in the public mempool before block inclusion. Low coverage indicates private or MEV transactions that bypass the public mempool." + "Percentage of transactions seen in the public mempool *before* the slot they were included in. Low coverage indicates private or MEV transactions that bypass the public mempool or are submitted just-in-time." ] }, { @@ -54,14 +118,19 @@ "# Aggregate by type\n", "df_summary = df.groupby([\"tx_type\", \"tx_type_label\"]).agg({\n", " \"total_txs\": \"sum\",\n", - " \"seen_in_mempool\": \"sum\",\n", + " \"seen_before_slot\": \"sum\",\n", + " \"seen_after_slot\": \"sum\",\n", "}).reset_index()\n", - "df_summary[\"coverage_pct\"] = df_summary[\"seen_in_mempool\"] / df_summary[\"total_txs\"] * 100\n", + "df_summary[\"never_seen\"] = df_summary[\"total_txs\"] - df_summary[\"seen_before_slot\"] - df_summary[\"seen_after_slot\"]\n", + "df_summary[\"before_pct\"] = df_summary[\"seen_before_slot\"] / df_summary[\"total_txs\"] * 100\n", + "df_summary[\"after_pct\"] = df_summary[\"seen_after_slot\"] / df_summary[\"total_txs\"] * 100\n", + "df_summary[\"never_pct\"] = df_summary[\"never_seen\"] / df_summary[\"total_txs\"] * 100\n", "\n", "# Display summary table\n", - "summary_display = df_summary[[\"tx_type_label\", \"total_txs\", \"seen_in_mempool\", \"coverage_pct\"]].copy()\n", - "summary_display.columns = [\"Type\", \"Total\", \"Seen\", \"Coverage %\"]\n", - "summary_display[\"Coverage %\"] = summary_display[\"Coverage %\"].round(1)\n", + "summary_display = df_summary[[\"tx_type_label\", \"total_txs\", \"before_pct\", \"after_pct\", \"never_pct\"]].copy()\n", + "summary_display.columns = [\"Type\", \"Total\", \"Before slot %\", \"After slot %\", \"Never seen %\"]\n", + "for col in summary_display.columns[2:]:\n", + " summary_display[col] = summary_display[col].round(1)\n", "summary_display" ] }, @@ -71,20 +140,41 @@ "metadata": {}, "outputs": [], "source": [ - "# Coverage bar chart\n", - "fig = px.bar(\n", - " df_summary,\n", - " x=\"tx_type_label\",\n", - " y=\"coverage_pct\",\n", - " color=\"tx_type\",\n", - " color_discrete_map=TX_TYPE_COLORS,\n", - " labels={\"tx_type_label\": \"Transaction type\", \"coverage_pct\": \"Mempool visibility (%)\"},\n", - " text=\"coverage_pct\",\n", - ")\n", - "fig.update_traces(texttemplate=\"%{text:.1f}%\", textposition=\"outside\", showlegend=False)\n", + "# Coverage stacked bar chart showing before/after/never breakdown\n", + "fig = go.Figure()\n", + "\n", + "fig.add_trace(go.Bar(\n", + " x=df_summary[\"tx_type_label\"],\n", + " y=df_summary[\"before_pct\"],\n", + " name=\"Before slot (public)\",\n", + " marker_color=\"#27ae60\",\n", + " text=df_summary[\"before_pct\"].round(1),\n", + " textposition=\"inside\",\n", + "))\n", + 
"fig.add_trace(go.Bar(\n", + " x=df_summary[\"tx_type_label\"],\n", + " y=df_summary[\"after_pct\"],\n", + " name=\"After slot (propagated)\",\n", + " marker_color=\"#3498db\",\n", + " text=df_summary[\"after_pct\"].round(1),\n", + " textposition=\"inside\",\n", + "))\n", + "fig.add_trace(go.Bar(\n", + " x=df_summary[\"tx_type_label\"],\n", + " y=df_summary[\"never_pct\"],\n", + " name=\"Never seen (private)\",\n", + " marker_color=\"#95a5a6\",\n", + " text=df_summary[\"never_pct\"].round(1),\n", + " textposition=\"inside\",\n", + "))\n", + "\n", + "fig.update_traces(texttemplate=\"%{text:.1f}%\")\n", "fig.update_layout(\n", + " barmode=\"stack\",\n", " margin=dict(l=60, r=30, t=30, b=60),\n", - " yaxis=dict(range=[0, 105]),\n", + " xaxis=dict(title=\"Transaction type\"),\n", + " yaxis=dict(title=\"Percentage\", range=[0, 105]),\n", + " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n", " height=400,\n", ")\n", "fig.show(config={\"responsive\": True})" @@ -104,7 +194,31 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Data is already hourly from the query\nfig = px.line(\n df,\n x=\"hour\",\n y=\"coverage_pct\",\n color=\"tx_type_label\",\n color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n labels={\"hour\": \"Time\", \"coverage_pct\": \"Mempool visibility (%)\", \"tx_type_label\": \"Type\"},\n markers=True,\n)\nfig.update_layout(\n margin=dict(l=60, r=30, t=30, b=60),\n legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n height=400,\n)\nfig.show(config={\"responsive\": True})" + "source": [ + "# Aggregate to hourly for time-series\n", + "df_hourly = df.groupby([\"hour\", \"tx_type\", \"tx_type_label\"]).agg({\n", + " \"total_txs\": \"sum\",\n", + " \"seen_before_slot\": \"sum\",\n", + " \"seen_after_slot\": \"sum\",\n", + "}).reset_index()\n", + "df_hourly[\"coverage_pct\"] = df_hourly[\"seen_before_slot\"] / df_hourly[\"total_txs\"] * 100\n", + "\n", + "fig = px.line(\n", + " df_hourly,\n", + " x=\"hour\",\n", + " y=\"coverage_pct\",\n", + " color=\"tx_type_label\",\n", + " color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n", + " labels={\"hour\": \"Time\", \"coverage_pct\": \"Seen before slot (%)\", \"tx_type_label\": \"Type\"},\n", + " markers=True,\n", + ")\n", + "fig.update_layout(\n", + " margin=dict(l=60, r=30, t=30, b=60),\n", + " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n", + " height=400,\n", + ")\n", + "fig.show(config={\"responsive\": True})" + ] }, { "cell_type": "markdown", @@ -120,19 +234,338 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Aggregate across types by hour (already hourly data)\ndf_volume = df.groupby(\"hour\").agg({\n \"total_txs\": \"sum\",\n \"seen_in_mempool\": \"sum\",\n}).reset_index()\ndf_volume[\"private_txs\"] = df_volume[\"total_txs\"] - df_volume[\"seen_in_mempool\"]\n\nfig = go.Figure()\nfig.add_trace(go.Bar(\n x=df_volume[\"hour\"],\n y=df_volume[\"seen_in_mempool\"],\n name=\"Public (seen in mempool)\",\n marker_color=\"#3498db\",\n))\nfig.add_trace(go.Bar(\n x=df_volume[\"hour\"],\n y=df_volume[\"private_txs\"],\n name=\"Private (not seen)\",\n marker_color=\"#95a5a6\",\n))\nfig.update_layout(\n barmode=\"stack\",\n margin=dict(l=60, r=30, t=30, b=60),\n xaxis=dict(title=\"Time\"),\n yaxis=dict(title=\"Transaction count\"),\n legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n 
height=400,\n)\nfig.show(config={\"responsive\": True})" + "source": [ + "# Aggregate across types by hour - 3-way breakdown\n", + "df_volume = df.groupby(\"hour\").agg({\n", + " \"total_txs\": \"sum\",\n", + " \"seen_before_slot\": \"sum\",\n", + " \"seen_after_slot\": \"sum\",\n", + "}).reset_index()\n", + "df_volume[\"never_seen\"] = df_volume[\"total_txs\"] - df_volume[\"seen_before_slot\"] - df_volume[\"seen_after_slot\"]\n", + "\n", + "fig = go.Figure()\n", + "fig.add_trace(go.Bar(\n", + " x=df_volume[\"hour\"],\n", + " y=df_volume[\"seen_before_slot\"],\n", + " name=\"Before slot (public)\",\n", + " marker_color=\"#27ae60\",\n", + "))\n", + "fig.add_trace(go.Bar(\n", + " x=df_volume[\"hour\"],\n", + " y=df_volume[\"seen_after_slot\"],\n", + " name=\"After slot (propagated)\",\n", + " marker_color=\"#3498db\",\n", + "))\n", + "fig.add_trace(go.Bar(\n", + " x=df_volume[\"hour\"],\n", + " y=df_volume[\"never_seen\"],\n", + " name=\"Never seen (private)\",\n", + " marker_color=\"#95a5a6\",\n", + "))\n", + "fig.update_layout(\n", + " barmode=\"stack\",\n", + " margin=dict(l=60, r=30, t=30, b=60),\n", + " xaxis=dict(title=\"Time\"),\n", + " yaxis=dict(title=\"Transaction count\"),\n", + " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n", + " height=400,\n", + ")\n", + "fig.show(config={\"responsive\": True})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Coverage heatmap\n", + "\n", + "Heatmap showing mempool visibility over time for each transaction type. Darker colors indicate higher coverage (more transactions seen in the public mempool)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pivot for heatmap using hourly aggregated data\n", + "df_pivot = df_hourly.pivot(index=\"tx_type_label\", columns=\"hour\", values=\"coverage_pct\").fillna(0)\n", + "\n", + "fig = go.Figure(\n", + " data=go.Heatmap(\n", + " z=df_pivot.values,\n", + " x=df_pivot.columns,\n", + " y=df_pivot.index,\n", + " colorscale=\"Greens\",\n", + " colorbar=dict(title=dict(text=\"Coverage %\", side=\"right\")),\n", + " )\n", + ")\n", + "fig.update_layout(\n", + " margin=dict(l=100, r=30, t=30, b=60),\n", + " xaxis=dict(title=\"Time\"),\n", + " yaxis=dict(title=\"Transaction type\"),\n", + " height=300,\n", + ")\n", + "fig.show(config={\"responsive\": True})" + ] }, { "cell_type": "markdown", "metadata": {}, - "source": "## Coverage heatmap\n\nHeatmap showing mempool visibility over time for each transaction type. Darker colors indicate higher coverage (more transactions seen in the public mempool)." + "source": [ + "## Mempool age distribution\n", + "\n", + "How long transactions waited in the mempool before being included in a block. The age is measured from first observation in our sentries to the slot start time. Only transactions seen *before* their inclusion slot are counted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract all percentiles for each type\n", + "def extract_percentiles(group):\n", + " # Collect all non-null percentile arrays, weighted by seen_before_slot count\n", + " pct_arrays = []\n", + " for _, row in group.iterrows():\n", + " if row['seen_before_slot'] > 0 and row['age_percentiles_ms'] is not None:\n", + " pcts = row['age_percentiles_ms']\n", + " if not any(np.isnan(pcts)):\n", + " pct_arrays.append(pcts)\n", + " \n", + " if not pct_arrays:\n", + " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n", + " \n", + " # Average percentiles across slots (simple mean for now)\n", + " avg_pcts = np.nanmean(pct_arrays, axis=0)\n", + " return pd.Series({\n", + " 'p50': avg_pcts[0] / 1000,\n", + " 'p75': avg_pcts[1] / 1000,\n", + " 'p80': avg_pcts[2] / 1000,\n", + " 'p85': avg_pcts[3] / 1000,\n", + " 'p90': avg_pcts[4] / 1000,\n", + " 'p95': avg_pcts[5] / 1000,\n", + " 'p99': avg_pcts[6] / 1000,\n", + " })\n", + "\n", + "df_age = df.groupby(['tx_type', 'tx_type_label']).apply(extract_percentiles, include_groups=False).reset_index()\n", + "\n", + "# Display age table\n", + "age_display = df_age[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n", + "age_display.columns = ['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n", + "for col in age_display.columns[1:]:\n", + " age_display[col] = age_display[col].round(1)\n", + "age_display" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Pivot for heatmap using hourly data directly\ndf_pivot = df.pivot(index=\"tx_type_label\", columns=\"hour\", values=\"coverage_pct\").fillna(0)\n\nfig = go.Figure(\n data=go.Heatmap(\n z=df_pivot.values,\n x=df_pivot.columns,\n y=df_pivot.index,\n colorscale=\"Greens\",\n colorbar=dict(title=dict(text=\"Coverage %\", side=\"right\")),\n )\n)\nfig.update_layout(\n margin=dict(l=100, r=30, t=30, b=60),\n xaxis=dict(title=\"Time\"),\n yaxis=dict(title=\"Transaction type\"),\n height=300,\n)\nfig.show(config={\"responsive\": True})" + "source": [ + "# Visualize age percentiles as line chart\n", + "df_age_long = df_age.melt(\n", + " id_vars=['tx_type', 'tx_type_label'],\n", + " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n", + " var_name='percentile',\n", + " value_name='age_s'\n", + ")\n", + "# Convert percentile labels to numeric for x-axis\n", + "df_age_long['pct_num'] = df_age_long['percentile'].str.replace('p', '').astype(int)\n", + "\n", + "fig = px.line(\n", + " df_age_long,\n", + " x='pct_num',\n", + " y='age_s',\n", + " color='tx_type_label',\n", + " color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n", + " markers=True,\n", + " log_y=True,\n", + " labels={'pct_num': 'Percentile', 'age_s': 'Age (seconds)', 'tx_type_label': 'Type'},\n", + ")\n", + "fig.update_layout(\n", + " margin=dict(l=60, r=30, t=30, b=60),\n", + " xaxis=dict(tickvals=[50, 75, 80, 85, 90, 95, 99], ticktext=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99']),\n", + " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n", + " height=400,\n", + ")\n", + "fig.show(config={\"responsive\": True})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregate histogram buckets across all slots per tx type\n", + "hist_cols = [f'age_hist_{i}' for 
i in range(15)]\n", + "df_hist = df.groupby(['tx_type', 'tx_type_label'])[hist_cols].sum().reset_index()\n", + "\n", + "# Melt to long format for plotting\n", + "df_hist_long = df_hist.melt(\n", + " id_vars=['tx_type', 'tx_type_label'],\n", + " value_vars=hist_cols,\n", + " var_name='bucket',\n", + " value_name='count'\n", + ")\n", + "df_hist_long['bucket_idx'] = df_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n", + "df_hist_long['bucket_label'] = df_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n", + "\n", + "# Sort by bucket index for proper ordering\n", + "df_hist_long = df_hist_long.sort_values(['tx_type', 'bucket_idx'])\n", + "\n", + "fig = px.bar(\n", + " df_hist_long,\n", + " x='bucket_label',\n", + " y='count',\n", + " color='tx_type_label',\n", + " color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n", + " facet_col='tx_type_label',\n", + " facet_col_wrap=2,\n", + " labels={'bucket_label': 'Age bucket', 'count': 'Count', 'tx_type_label': 'Type'},\n", + " category_orders={'bucket_label': HIST_LABELS},\n", + ")\n", + "fig.update_yaxes(matches=None, showticklabels=True)\n", + "fig.update_layout(\n", + " margin=dict(l=60, r=30, t=60, b=100),\n", + " showlegend=False,\n", + " height=600,\n", + ")\n", + "fig.update_xaxes(tickangle=45)\n", + "fig.for_each_annotation(lambda a: a.update(text=a.text.split(\"=\")[-1]))\n", + "fig.show(config={\"responsive\": True})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Propagation delay (seen after slot)\n", + "\n", + "For transactions first seen in the mempool *after* block inclusion, this measures how long after the slot start they appeared." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extract delay percentiles for transactions seen AFTER slot start\n", + "def extract_delay_percentiles(group):\n", + " pct_arrays = []\n", + " for _, row in group.iterrows():\n", + " if row['seen_after_slot'] > 0 and row['delay_percentiles_ms'] is not None:\n", + " pcts = row['delay_percentiles_ms']\n", + " if not any(np.isnan(pcts)):\n", + " pct_arrays.append(pcts)\n", + " \n", + " if not pct_arrays:\n", + " return pd.Series({'p50': np.nan, 'p75': np.nan, 'p80': np.nan, 'p85': np.nan, 'p90': np.nan, 'p95': np.nan, 'p99': np.nan})\n", + " \n", + " avg_pcts = np.nanmean(pct_arrays, axis=0)\n", + " return pd.Series({\n", + " 'p50': avg_pcts[0] / 1000,\n", + " 'p75': avg_pcts[1] / 1000,\n", + " 'p80': avg_pcts[2] / 1000,\n", + " 'p85': avg_pcts[3] / 1000,\n", + " 'p90': avg_pcts[4] / 1000,\n", + " 'p95': avg_pcts[5] / 1000,\n", + " 'p99': avg_pcts[6] / 1000,\n", + " })\n", + "\n", + "df_delay = df.groupby(['tx_type', 'tx_type_label']).apply(extract_delay_percentiles, include_groups=False).reset_index()\n", + "\n", + "# Display delay table\n", + "delay_display = df_delay[['tx_type_label', 'p50', 'p75', 'p90', 'p95', 'p99']].copy()\n", + "delay_display.columns = ['Type', 'p50 (s)', 'p75 (s)', 'p90 (s)', 'p95 (s)', 'p99 (s)']\n", + "for col in delay_display.columns[1:]:\n", + " delay_display[col] = delay_display[col].round(2)\n", + "delay_display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize delay percentiles as line chart\n", + "df_delay_long = df_delay.melt(\n", + " id_vars=['tx_type', 'tx_type_label'],\n", + " value_vars=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99'],\n", + " var_name='percentile',\n", + " value_name='delay_s'\n", + 
")\n", + "# Convert percentile labels to numeric for x-axis\n", + "df_delay_long['pct_num'] = df_delay_long['percentile'].str.replace('p', '').astype(int)\n", + "\n", + "fig = px.line(\n", + " df_delay_long,\n", + " x='pct_num',\n", + " y='delay_s',\n", + " color='tx_type_label',\n", + " color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n", + " markers=True,\n", + " log_y=True,\n", + " labels={'pct_num': 'Percentile', 'delay_s': 'Delay (seconds)', 'tx_type_label': 'Type'},\n", + ")\n", + "fig.update_layout(\n", + " margin=dict(l=60, r=30, t=30, b=60),\n", + " xaxis=dict(tickvals=[50, 75, 80, 85, 90, 95, 99], ticktext=['p50', 'p75', 'p80', 'p85', 'p90', 'p95', 'p99']),\n", + " legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"left\", x=0),\n", + " height=400,\n", + ")\n", + "fig.show(config={\"responsive\": True})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Aggregate delay histogram buckets across all slots per tx type\n", + "delay_hist_cols = [f'delay_hist_{i}' for i in range(15)]\n", + "df_delay_hist = df.groupby(['tx_type', 'tx_type_label'])[delay_hist_cols].sum().reset_index()\n", + "\n", + "# Melt to long format for plotting\n", + "df_delay_hist_long = df_delay_hist.melt(\n", + " id_vars=['tx_type', 'tx_type_label'],\n", + " value_vars=delay_hist_cols,\n", + " var_name='bucket',\n", + " value_name='count'\n", + ")\n", + "df_delay_hist_long['bucket_idx'] = df_delay_hist_long['bucket'].str.extract(r'(\\d+)').astype(int)\n", + "df_delay_hist_long['bucket_label'] = df_delay_hist_long['bucket_idx'].map(dict(enumerate(HIST_LABELS)))\n", + "\n", + "# Sort by bucket index for proper ordering\n", + "df_delay_hist_long = df_delay_hist_long.sort_values(['tx_type', 'bucket_idx'])\n", + "\n", + "fig = px.bar(\n", + " df_delay_hist_long,\n", + " x='bucket_label',\n", + " y='count',\n", + " color='tx_type_label',\n", + " color_discrete_map={v: TX_TYPE_COLORS[k] for k, v in TX_TYPE_LABELS.items()},\n", + " facet_col='tx_type_label',\n", + " facet_col_wrap=2,\n", + " labels={'bucket_label': 'Delay bucket', 'count': 'Count', 'tx_type_label': 'Type'},\n", + " category_orders={'bucket_label': HIST_LABELS},\n", + ")\n", + "fig.update_yaxes(matches=None, showticklabels=True)\n", + "fig.update_layout(\n", + " margin=dict(l=60, r=30, t=60, b=100),\n", + " showlegend=False,\n", + " height=600,\n", + ")\n", + "fig.update_xaxes(tickangle=45)\n", + "fig.for_each_annotation(lambda a: a.update(text=a.text.split(\"=\")[-1]))\n", + "fig.show(config={\"responsive\": True})" + ] }, { "cell_type": "markdown", @@ -199,4 +632,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/pipeline.yaml b/pipeline.yaml index 07af0c6..99d1dc2 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -88,6 +88,12 @@ queries: description: Per-sentry mempool coverage rates output_file: sentry_coverage.parquet + mempool_availability: + module: queries.mempool_visibility + function: fetch_mempool_availability + description: Per-slot mempool availability with age percentiles + output_file: mempool_availability.parquet + block_production_timeline: module: queries.block_production_timeline function: fetch_block_production_timeline @@ -153,6 +159,7 @@ notebooks: - tx_per_slot - mempool_coverage - sentry_coverage + - mempool_availability parameters: - name: target_date type: date diff --git a/queries/mempool_visibility.py b/queries/mempool_visibility.py index 4b7435f..5ae013b 100644 --- 
a/queries/mempool_visibility.py
+++ b/queries/mempool_visibility.py
@@ -113,3 +113,117 @@ def fetch_sentry_coverage(
     df = client.query_df(query)
 
     return df, query
+
+
+def fetch_mempool_availability(
+    client,
+    target_date: str,
+    network: str = "mainnet",
+) -> tuple:
+    """Fetch per-slot mempool availability with age percentiles and histograms.
+
+    Categorizes transactions into:
+    - seen_before_slot: Available in mempool before inclusion (public)
+    - seen_after_slot: First appeared in mempool after block propagation
+    - neither: Truly private (never seen in mempool)
+
+    Returns per slot per tx type:
+    - age/delay percentiles (p50, p75, p80, p85, p90, p95, p99)
+    - age/delay histograms (log2 buckets in seconds)
+
+    Histogram buckets (log2 seconds, matching AGE_HIST_LABELS below):
+        0: <0.5s, 1: 0.5-1s, 2: 1-2s, 3: 2-4s, 4: 4-8s, 5: 8-16s, 6: 16-32s, 7: 32s-1m,
+        8: 1-2m, 9: 2-4m, 10: 4-8m, 11: 8-17m, 12: 17-34m, 13: 34-60m, 14: >=1h
+
+    Returns (df, query).
+    """
+    date_filter = _get_date_filter(target_date)
+
+    # Define reusable condition fragments
+    seen_before = """
+        m.first_event_time IS NOT NULL
+        AND m.first_event_time > '2020-01-01'
+        AND m.first_event_time < c.slot_start_date_time"""
+    seen_after = """
+        m.first_event_time IS NOT NULL
+        AND m.first_event_time > '2020-01-01'
+        AND m.first_event_time >= c.slot_start_date_time"""
+
+    # Age = time from first seen to slot start (for seen_before)
+    age_ms = "dateDiff('millisecond', m.first_event_time, c.slot_start_date_time)"
+    # Delay = time from slot start to first seen (for seen_after)
+    delay_ms = "dateDiff('millisecond', c.slot_start_date_time, m.first_event_time)"
+
+    # Log2 bucket boundaries in milliseconds (up to 1 hour)
+    # Buckets: <0.5s, 0.5-1s, 1-2s, 2-4s, 4-8s, 8-16s, 16-32s, 32-64s (32s-1m),
+    # 64-128s (1-2m), 128-256s (2-4m), 256-512s (4-8m), 512-1024s (8-17m),
+    # 1024-2048s (17-34m), 2048-3600s (34-60m), >=3600s (>=1h)
+    bounds_ms = [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000, 512000, 1024000, 2048000, 3600000]
+
+    # Generate histogram countIf expressions
+    def hist_columns(value_expr: str, condition: str, prefix: str) -> str:
+        cols = []
+        # Bucket 0: < 0.5s
+        cols.append(f"countIf({value_expr} < {bounds_ms[0]} AND {condition}) AS {prefix}_0")
+        # Buckets 1-13: range buckets between consecutive bounds
+        for i in range(len(bounds_ms) - 1):
+            cols.append(
+                f"countIf({value_expr} >= {bounds_ms[i]} AND {value_expr} < {bounds_ms[i+1]} AND {condition}) AS {prefix}_{i+1}"
+            )
+        # Bucket 14: >= 3600s (1 hour)
+        cols.append(f"countIf({value_expr} >= {bounds_ms[-1]} AND {condition}) AS {prefix}_{len(bounds_ms)}")
+        return ",\n    ".join(cols)
+
+    age_hist = hist_columns(age_ms, seen_before, "age_hist")
+    delay_hist = hist_columns(delay_ms, seen_after, "delay_hist")
+
+    query = f"""
+WITH first_seen AS (
+    SELECT
+        hash,
+        min(event_date_time) AS first_event_time
+    FROM mempool_transaction
+    WHERE meta_network_name = '{network}'
+        AND event_date_time >= '{target_date}'::date - INTERVAL 1 DAY
+        AND event_date_time < '{target_date}'::date + INTERVAL 2 DAY
+    GROUP BY hash
+)
+SELECT
+    c.slot,
+    c.slot_start_date_time,
+    c.type AS tx_type,
+    count() AS total_txs,
+    -- Seen BEFORE slot start (public, available for inclusion)
+    countIf({seen_before}) AS seen_before_slot,
+    -- Seen AFTER slot start (appeared after block propagation)
+    countIf({seen_after}) AS seen_after_slot,
+    -- Age percentiles for transactions seen BEFORE (how long in mempool)
+    quantilesIf(0.50, 0.75, 0.80, 0.85, 0.90, 0.95, 0.99)(
+        {age_ms}, {seen_before}
+    ) AS age_percentiles_ms,
+    -- Delay percentiles for transactions seen AFTER 
(propagation delay) + quantilesIf(0.50, 0.75, 0.80, 0.85, 0.90, 0.95, 0.99)( + {delay_ms}, {seen_after} + ) AS delay_percentiles_ms, + -- Age histogram (log2 buckets in seconds) + {age_hist}, + -- Delay histogram (log2 buckets in seconds) + {delay_hist} +FROM canonical_beacon_block_execution_transaction c +GLOBAL LEFT JOIN first_seen m ON c.hash = m.hash +WHERE c.meta_network_name = '{network}' + AND {date_filter} +GROUP BY c.slot, c.slot_start_date_time, c.type +ORDER BY c.slot, c.type +""" + + df = client.query_df(query) + return df, query + + +# Histogram bucket labels for visualization +AGE_HIST_LABELS = [ + "<0.5s", "0.5-1s", "1-2s", "2-4s", "4-8s", "8-16s", + "16-32s", "32s-1m", "1-2m", "2-4m", "4-8m", "8-17m", + "17-34m", "34-60m", ">=1h" +]
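
For reference, the log2 bucketing that the `hist_columns` countIf expressions implement can be reproduced client-side. The sketch below (not part of the diff) reuses the same `bounds_ms` boundaries and `AGE_HIST_LABELS` from the query module; the `ages_ms` values are hypothetical example data.

```python
import numpy as np

# Same boundaries (ms) and labels as fetch_mempool_availability / AGE_HIST_LABELS.
BOUNDS_MS = [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000,
             128000, 256000, 512000, 1024000, 2048000, 3600000]
LABELS = ["<0.5s", "0.5-1s", "1-2s", "2-4s", "4-8s", "8-16s",
          "16-32s", "32s-1m", "1-2m", "2-4m", "4-8m", "8-17m",
          "17-34m", "34-60m", ">=1h"]

ages_ms = np.array([120, 750, 9_500, 95_000, 4_000_000])  # hypothetical example ages

# side="right" puts a value equal to a boundary into the higher bucket,
# matching the query's `>= lower AND < upper` countIf conditions:
# index 0 -> <0.5s, index 14 -> >=1h.
bucket_idx = np.searchsorted(BOUNDS_MS, ages_ms, side="right")

for age, idx in zip(ages_ms, bucket_idx):
    print(f"{age:>9} ms -> bucket {idx:2d} ({LABELS[idx]})")
```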