
Commit 831265f

Merge pull request #12 from project-aethermesh/feat/use-public-endpoints
feat: introduce a new public-first load-balancing strategy
2 parents 82d8b37 + d318b0a commit 831265f

9 files changed: 202 additions, 63 deletions


.env.example

Lines changed: 4 additions, 0 deletions

```diff
@@ -50,6 +50,10 @@ PROXY_MAX_RETRIES=3
 PROXY_TIMEOUT=15
 ## Timeout per individual retry attempt in seconds
 PROXY_TIMEOUT_PER_TRY=5
+## Prioritize public endpoints over primary endpoints (true/false)
+PUBLIC_FIRST=false
+## Number of attempts to make at public endpoints before trying primary/fallback
+PUBLIC_FIRST_ATTEMPTS=2

 # Logging Configuration
 ## Log level: debug, info, warn, error, fatal, panic
```

README.md

Lines changed: 48 additions, 4 deletions

```diff
@@ -5,7 +5,8 @@ A lightweight, low-latency RPC load balancer written in Go. It is designed to ma
 ## Features

 - **Round-Robin Load Balancing**: Distributes requests to available endpoints in a round-robin manner, prioritizing those with fewer requests in the last 24 hours.
-- **Intelligent Retry Logic**: Configurable retry attempts with priority-based endpoint selection (first primary endpoints, then fallbacks).
+- **Intelligent Retry Logic**: Configurable retry attempts with priority-based endpoint selection (primary endpoints, fallbacks, and optional public-first mode).
+- **Public-First Mode**: Optional prioritization of public RPC endpoints to reduce costs while maintaining reliability.
 - **Flexible Timeout Control**: Separate timeouts for overall requests and individual retry attempts.
 - **Rate Limit Recovery**: Safe rate limit detection and recovery with exponential backoff strategies per endpoint, to avoid making things worse when a provider is rate-limiting you.
 - **Health Checks**: Regularly checks the health of upstream endpoints and updates their status in Redis.
```
```diff
@@ -109,10 +110,13 @@ The load balancer implements intelligent retry logic with configurable timeouts:
 ### How Retries Work

-1. **Priority-based selection**: Always tries primary endpoints first, then fallbacks.
+1. **Priority-based selection**: Endpoint selection follows these priorities:
+   - **Normal mode**: primary → fallback → public
+   - **Public-first mode** (`PUBLIC_FIRST=true`): public → primary → fallback
 2. **Configurable attempts**: Retries up to `PROXY_MAX_RETRIES` times.
-3. **Endpoint rotation**: Removes failed endpoints from the retry pool to avoid repeated failures.
-4. **Dual timeout control**: There are 2 settings that control how long requests take:
+3. **Public endpoint limiting**: When `PUBLIC_FIRST=true`, attempts to reach public endpoints are limited to the value of `PUBLIC_FIRST_ATTEMPTS`, after which the proxy tries using a primary or fallback endpoint.
+4. **Endpoint rotation**: Removes failed endpoints from the retry pool to avoid repeated failures.
+5. **Dual timeout control**: There are 2 settings that control how long requests take:
    - **Total request timeout** (`PROXY_TIMEOUT`): Maximum time for the entire request (this is what the end user "sees").
    - **Per-try timeout** (`PROXY_TIMEOUT_PER_TRY`): Maximum time per individual request sent from the proxy to each endpoint.
```

```diff
@@ -141,6 +145,8 @@ The load balancer implements intelligent retry logic with configurable timeouts:
 | `--proxy-retries` | `3` | Maximum number of retries for proxy requests |
 | `--proxy-timeout` | `15` | Total timeout for proxy requests in seconds |
 | `--proxy-timeout-per-try` | `5` | Timeout per individual retry attempt in seconds |
+| `--public-first` | `false` | Prioritize public endpoints over primary endpoints |
+| `--public-first-attempts` | `2` | Number of attempts to make at public endpoints before trying primary/fallback |
 | `--redis-host` | `localhost` | Redis server hostname |
 | `--redis-pass` | - | Redis server password |
 | `--redis-port` | `6379` | Redis server port |
```
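The new flags combine with the existing ones in the usual way; a possible invocation might look like the following (the binary name `aetherlay` is assumed here, and is not confirmed by this diff):

```shell
# Hypothetical invocation — binary name assumed, flags taken from the table above
./aetherlay --public-first --public-first-attempts=2 --proxy-retries=3

# Equivalent configuration via environment variables
PUBLIC_FIRST=true PUBLIC_FIRST_ATTEMPTS=2 ./aetherlay
```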
```diff
@@ -170,6 +176,8 @@ The load balancer implements intelligent retry logic with configurable timeouts:
 | `PROXY_MAX_RETRIES` | `3` | Maximum number of retries for proxy requests |
 | `PROXY_TIMEOUT` | `15` | Total timeout for proxy requests in seconds |
 | `PROXY_TIMEOUT_PER_TRY` | `5` | Timeout per individual retry attempt in seconds |
+| `PUBLIC_FIRST` | `false` | Prioritize public endpoints over primary and fallback endpoints |
+| `PUBLIC_FIRST_ATTEMPTS` | `2` | Number of attempts to make at public endpoints before trying with a primary/fallback |
 | `REDIS_HOST` | `localhost` | Redis server hostname |
 | `REDIS_PASS` | - | Redis server password |
 | `REDIS_PORT` | `6379` | Redis server port |
```
````diff
@@ -215,6 +223,42 @@ For production deployments with multiple load balancer pods, use the standalone
 - **Resource Efficiency**: Reduces RPC endpoint usage
 - **Better Separation of Concerns**: Health monitoring isolated from request handling

+## Public-First Mode
+
+Ætherlay supports a "public-first" mode that prioritizes public RPC endpoints over primary and fallback endpoints to help reduce costs while maintaining reliability.
+
+### How Public-First Mode Works
+
+1. **Enable public-first**: Set `PUBLIC_FIRST=true` (or use the `--public-first` CLI flag)
+2. **Configure attempts**: Set `PUBLIC_FIRST_ATTEMPTS` to control how many public endpoints to try (default: 2)
+3. **Endpoint hierarchy**:
+   - **When enabled**: public → primary → fallback
+   - **When disabled**: primary → fallback → public
+
+### Configuration Example
+
+In your `endpoints.json`, mark endpoints with `"role": "public"`:
+
+```json
+{
+  "mainnet": {
+    "publicnode-1": {
+      "provider": "publicnode",
+      "role": "public",
+      "type": "archive",
+      "http_url": "https://ethereum-rpc.publicnode.com",
+      "ws_url": "wss://ethereum-rpc.publicnode.com"
+    },
+    "alchemy-1": {
+      "provider": "alchemy",
+      "role": "primary",
+      "type": "archive",
+      "http_url": "https://eth-mainnet.g.alchemy.com/v2/${ALCHEMY_API_KEY}"
+    }
+  }
+}
+```
+
 ## Rate Limit Recovery

 Ætherlay includes intelligent rate limit detection and recovery mechanisms to handle upstream provider rate limits gracefully. This system automatically detects when endpoints are rate-limited and implements recovery strategies to restore service.
````

configs/endpoints-example.json

Lines changed: 6 additions, 6 deletions

```diff
@@ -2,7 +2,7 @@
   "mainnet": {
     "llama-1": {
       "provider": "llama",
-      "role": "primary",
+      "role": "public",
       "type": "full",
       "http_url": "https://eth.llamarpc.com",
       "rate_limit_recovery": {
@@ -16,7 +16,7 @@
     },
     "drpc-1": {
       "provider": "drpc",
-      "role": "fallback",
+      "role": "primary",
       "type": "full",
       "http_url": "https://eth.drpc.org",
       "ws_url": "wss://eth.drpc.org"
@@ -32,13 +32,13 @@
   "arbitrum": {
     "drpc-1": {
       "provider": "drpc",
-      "role": "primary",
+      "role": "public",
       "type": "full",
       "http_url": "https://arbitrum.drpc.org"
     },
     "publicnode-1": {
       "provider": "public_node",
-      "role": "fallback",
+      "role": "public",
       "type": "archive",
       "http_url": "https://arbitrum-one-rpc.publicnode.com",
       "ws_url": "wss://arbitrum-one-rpc.publicnode.com"
@@ -47,7 +47,7 @@
   "base": {
     "alchemy-test": {
       "provider": "alchemy",
-      "role": "primary",
+      "role": "public",
       "type": "archive",
       "http_url": "https://base-mainnet.g.alchemy.com/v2/${ALCHEMY_API_KEY}",
       "ws_url": "wss://base-mainnet.g.alchemy.com/v2/${ALCHEMY_API_KEY}",
@@ -62,7 +62,7 @@
     },
     "infura-staging": {
       "provider": "infura",
-      "role": "primary",
+      "role": "fallback",
       "type": "archive",
       "http_url": "https://base-mainnet.infura.io/v3/${INFURA_API_KEY}",
       "ws_url": "wss://base-mainnet.infura.io/ws/v3/${INFURA_API_KEY}",
```

internal/config/config.go

Lines changed: 18 additions, 0 deletions

```diff
@@ -119,6 +119,24 @@ func (c *Config) GetFallbackEndpoints(chain string) []Endpoint {
 	return fallbackEndpoints
 }

+// GetPublicEndpoints returns all public endpoints for a chain.
+// Public endpoints are free/public RPC nodes that can be prioritized when PUBLIC_FIRST is enabled.
+// Returns nil if the chain doesn't exist or has no public endpoints.
+func (c *Config) GetPublicEndpoints(chain string) []Endpoint {
+	endpoints, exists := c.Endpoints[chain]
+	if !exists {
+		return nil
+	}
+
+	var publicEndpoints []Endpoint
+	for _, endpoint := range endpoints {
+		if endpoint.Role == "public" {
+			publicEndpoints = append(publicEndpoints, endpoint)
+		}
+	}
+	return publicEndpoints
+}
+
 // DefaultRateLimitRecovery returns the default rate limit recovery configuration
 func DefaultRateLimitRecovery() RateLimitRecovery {
 	return RateLimitRecovery{
```

internal/config/config_test.go

Lines changed: 2 additions, 2 deletions

```diff
@@ -33,8 +33,8 @@ func TestLoadConfig(t *testing.T) {
 		t.Errorf("Expected provider 'llama', got '%s'", llamaEndpoint.Provider)
 	}

-	if llamaEndpoint.Role != "primary" {
-		t.Errorf("Expected role 'primary', got '%s'", llamaEndpoint.Role)
+	if llamaEndpoint.Role != "public" {
+		t.Errorf("Expected role 'public', got '%s'", llamaEndpoint.Role)
 	}
 }
```

internal/helpers/helpers.go

Lines changed: 8 additions, 0 deletions

```diff
@@ -25,6 +25,8 @@ type Config struct {
 	ProxyMaxRetries     int
 	ProxyTimeout        int
 	ProxyTimeoutPerTry  int
+	PublicFirst         bool
+	PublicFirstAttempts int
 	RedisHost           string
 	RedisPass           string
 	RedisPort           string
@@ -52,6 +54,8 @@ func ParseFlags() *Config {
 	flag.IntVar(&config.ProxyMaxRetries, "proxy-retries", 3, "Maximum number of retries for proxy requests")
 	flag.IntVar(&config.ProxyTimeout, "proxy-timeout", 15, "Timeout for proxy requests in seconds")
 	flag.IntVar(&config.ProxyTimeoutPerTry, "proxy-timeout-per-try", 5, "Timeout per individual retry attempt in seconds")
+	flag.BoolVar(&config.PublicFirst, "public-first", false, "Prioritize public endpoints over primary endpoints")
+	flag.IntVar(&config.PublicFirstAttempts, "public-first-attempts", 2, "Number of attempts to make at public endpoints before trying primary/fallback")
 	flag.StringVar(&config.RedisHost, "redis-host", "localhost", "Redis host")
 	flag.StringVar(&config.RedisPass, "redis-pass", "", "Redis password")
 	flag.StringVar(&config.RedisPort, "redis-port", "6379", "Redis port")
@@ -117,6 +121,8 @@ func (c *Config) LoadConfiguration() *LoadedConfig {
 		ProxyMaxRetries:     c.GetIntValue("proxy-retries", c.ProxyMaxRetries, "PROXY_MAX_RETRIES", 3),
 		ProxyTimeout:        c.GetIntValue("proxy-timeout", c.ProxyTimeout, "PROXY_TIMEOUT", 15),
 		ProxyTimeoutPerTry:  c.GetIntValue("proxy-timeout-per-try", c.ProxyTimeoutPerTry, "PROXY_TIMEOUT_PER_TRY", 5),
+		PublicFirst:         c.GetBoolValue("public-first", c.PublicFirst, "PUBLIC_FIRST", false),
+		PublicFirstAttempts: c.GetIntValue("public-first-attempts", c.PublicFirstAttempts, "PUBLIC_FIRST_ATTEMPTS", 2),
 		RedisHost:           c.GetStringValue("redis-host", c.RedisHost, "REDIS_HOST", "localhost"),
 		RedisPass:           c.GetStringValue("redis-pass", c.RedisPass, "REDIS_PASS", ""),
 		RedisPort:           c.GetStringValue("redis-port", c.RedisPort, "REDIS_PORT", "6379"),
@@ -142,6 +148,8 @@ type LoadedConfig struct {
 	ProxyMaxRetries     int
 	ProxyTimeout        int
 	ProxyTimeoutPerTry  int
+	PublicFirst         bool
+	PublicFirstAttempts int
 	RedisHost           string
 	RedisPass           string
 	RedisPort           string
```
