Skip to content

Commit a452f5c

Browse files
RD-5176 - don't retry to send after timeout to edge (#213)
* don't retry to send after timeout to edge
1 parent c3c1ada commit a452f5c

File tree

3 files changed

+76
-18
lines changed

3 files changed

+76
-18
lines changed

src/lumigo_tracer/lumigo_utils.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import datetime
12
import decimal
23
import base64
34
import hashlib
@@ -9,6 +10,7 @@
910
from functools import reduce, lru_cache
1011
import time
1112
import http.client
13+
import socket
1214
from collections import OrderedDict
1315
import random
1416
from typing import Union, List, Optional, Dict, Any, Tuple, Pattern, TypeVar
@@ -33,6 +35,7 @@
3335
HTTPS_PREFIX = "https://"
3436
LOG_FORMAT = "#LUMIGO# - %(asctime)s - %(levelname)s - %(message)s"
3537
SECONDS_TO_TIMEOUT = 0.5
38+
COOLDOWN_AFTER_TIMEOUT_DURATION = datetime.timedelta(seconds=10)
3639
LUMIGO_EVENT_KEY = "_lumigo"
3740
STEP_FUNCTION_UID_KEY = "step_function_uid"
3841
# number of spans that are too big to enter the reported message before break
@@ -81,7 +84,6 @@
8184

8285
edge_kinesis_boto_client = None
8386
edge_connection = None
84-
internal_error_already_logged = False
8587

8688

8789
def should_use_tracer_extension() -> bool:
@@ -92,18 +94,25 @@ def get_region() -> str:
9294
return os.environ.get("AWS_REGION") or "UNKNOWN"
9395

9496

95-
try:
96-
# Try to establish the connection in initialization
97-
if (
98-
os.environ.get("LUMIGO_INITIALIZATION_CONNECTION", "").lower() != "false"
99-
and get_region() != CHINA_REGION # noqa
100-
):
101-
edge_connection = http.client.HTTPSConnection( # type: ignore
102-
EDGE_HOST.format(region=os.environ.get("AWS_REGION")), timeout=EDGE_TIMEOUT
103-
)
104-
edge_connection.connect()
105-
except Exception:
106-
pass
97+
class InternalState:
98+
timeout_on_connection: Optional[datetime.datetime] = None
99+
internal_error_already_logged = False
100+
101+
@staticmethod
102+
def reset():
103+
InternalState.timeout_on_connection = None
104+
InternalState.internal_error_already_logged = False
105+
106+
@staticmethod
107+
def mark_timeout_to_edge():
108+
InternalState.timeout_on_connection = datetime.datetime.now()
109+
110+
@staticmethod
111+
def should_report_to_edge() -> bool:
112+
if not InternalState.timeout_on_connection:
113+
return True
114+
time_diff = datetime.datetime.now() - InternalState.timeout_on_connection
115+
return time_diff > COOLDOWN_AFTER_TIMEOUT_DURATION
107116

108117

109118
class Configuration:
@@ -275,8 +284,10 @@ def _create_request_body(
275284
return aws_dump(spans_to_send)[:max_size]
276285

277286

278-
def establish_connection(host):
287+
def establish_connection(host=None):
279288
try:
289+
if not host:
290+
host = get_edge_host(os.environ.get("AWS_REGION"))
280291
return http.client.HTTPSConnection(host, timeout=EDGE_TIMEOUT)
281292
except Exception as e:
282293
get_logger().exception(f"Could not establish connection to {host}", exc_info=e)
@@ -303,6 +314,9 @@ def report_json(region: Optional[str], msgs: List[dict], should_retry: bool = Tr
303314
:return: The duration of reporting (in milliseconds),
304315
or 0 if we didn't send (due to configuration or fail).
305316
"""
317+
if not InternalState.should_report_to_edge():
318+
get_logger().info("Skip sending messages due to previous timeout")
319+
return 0
306320
if not Configuration.should_report:
307321
return 0
308322
get_logger().info(f"reporting the messages: {msgs[:10]}")
@@ -338,6 +352,10 @@ def report_json(region: Optional[str], msgs: List[dict], should_retry: bool = Tr
338352
response.read() # We most read the response to keep the connection available
339353
duration = int((time.time() - start_time) * 1000)
340354
get_logger().info(f"successful reporting, code: {getattr(response, 'code', 'unknown')}")
355+
except socket.timeout:
356+
get_logger().exception(f"Timeout while connecting to {host}")
357+
InternalState.mark_timeout_to_edge()
358+
internal_analytics_message("report: socket.timeout")
341359
except Exception as e:
342360
if should_retry:
343361
get_logger().exception(f"Could not report to {host}. Retrying.", exc_info=e)
@@ -529,12 +547,11 @@ def warn_client(msg: str) -> None:
529547

530548

531549
def internal_analytics_message(msg: str, force: bool = False) -> None:
532-
global internal_error_already_logged
533550
if os.environ.get("LUMIGO_ANALYTICS") != "off":
534-
if force or not internal_error_already_logged:
551+
if force or not InternalState.internal_error_already_logged:
535552
b64_message = base64.b64encode(msg.encode()).decode()
536553
print(f"{INTERNAL_ANALYTICS_PREFIX}: {b64_message}")
537-
internal_error_already_logged = True
554+
InternalState.internal_error_already_logged = True
538555

539556

540557
def is_api_gw_event(event: dict) -> bool:
@@ -753,3 +770,17 @@ def is_provision_concurrency_initialization() -> bool:
753770
def get_stacktrace(exception: Exception) -> str:
754771
original_traceback = traceback.format_tb(exception.__traceback__)
755772
return "".join(filter(lambda line: STACKTRACE_LINE_TO_DROP not in line, original_traceback))
773+
774+
775+
try:
776+
# Try to establish the connection in initialization
777+
if (
778+
os.environ.get("LUMIGO_INITIALIZATION_CONNECTION", "").lower() != "false"
779+
and get_region() != CHINA_REGION # noqa
780+
):
781+
edge_connection = establish_connection()
782+
edge_connection.connect()
783+
except socket.timeout:
784+
InternalState.mark_timeout_to_edge()
785+
except Exception:
786+
pass

src/test/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88
import pytest
99

1010
from lumigo_tracer import lumigo_utils
11-
from lumigo_tracer.lumigo_utils import Configuration, get_omitting_regex, get_logger, get_edge_host
11+
from lumigo_tracer.lumigo_utils import (
12+
Configuration,
13+
get_omitting_regex,
14+
get_logger,
15+
get_edge_host,
16+
InternalState,
17+
)
1218
from lumigo_tracer.spans_container import SpansContainer
1319
from lumigo_tracer.wrappers.http.http_data_classes import HttpState
1420

@@ -42,6 +48,7 @@ def restart_global_span():
4248
yield
4349
SpansContainer._span = None
4450
HttpState.clear()
51+
InternalState.reset()
4552

4653

4754
@pytest.yield_fixture(autouse=True)

src/test/unit/test_lumigo_utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from decimal import Decimal
77
import datetime
88
import http.client
9+
import socket
910

1011
import boto3
1112
from mock import Mock, MagicMock
@@ -43,6 +44,7 @@
4344
CHINA_REGION,
4445
internal_analytics_message,
4546
INTERNAL_ANALYTICS_PREFIX,
47+
InternalState,
4648
)
4749
import json
4850

@@ -479,6 +481,24 @@ def test_report_json_retry(monkeypatch, reporter_mock, caplog, errors, final_log
479481
assert caplog.records[-1].levelname == final_log
480482

481483

484+
def test_report_json_fast_failure_after_timeout(monkeypatch, reporter_mock, caplog):
485+
reporter_mock.side_effect = report_json
486+
monkeypatch.setattr(Configuration, "host", "host")
487+
monkeypatch.setattr(Configuration, "should_report", True)
488+
monkeypatch.setattr(http.client, "HTTPSConnection", Mock())
489+
http.client.HTTPSConnection("force_reconnect").getresponse.side_effect = socket.timeout
490+
491+
assert report_json(None, [{"a": "b"}]) == 0
492+
assert caplog.records[-1].msg == "Timeout while connecting to host"
493+
494+
assert report_json(None, [{"a": "b"}]) == 0
495+
assert caplog.records[-1].msg == "Skip sending messages due to previous timeout"
496+
497+
InternalState.timeout_on_connection = datetime.datetime(2016, 1, 1)
498+
assert report_json(None, [{"a": "b"}]) == 0
499+
assert caplog.records[-1].msg == "Timeout while connecting to host"
500+
501+
482502
def test_report_json_china_missing_access_key_id(monkeypatch, reporter_mock, caplog):
483503
monkeypatch.setattr(Configuration, "should_report", True)
484504
reporter_mock.side_effect = report_json

0 commit comments

Comments
 (0)