From 79385864ae59283328910db404053cb17098380c Mon Sep 17 00:00:00 2001
From: DanielBotnik <danielbotnik@gmail.com>
Date: Fri, 29 May 2026 01:16:41 +0300
Subject: [PATCH] Deflake test_linux_network_stacksmash_64

The test was wrapped in @flaky(max_runs=3) to paper over several timing races
in driving the network exploit. Fix the underlying races and drop the wrapper.

Root cause of the CI failure (exploit subprocess times out, empty stdout):
the target services a connection with a single recv(), but the generated
call_shellcode exploit sends the shellcode payload and the follow-up shell
commands (e.g. "echo hello\n", "exit\n") back to back. Under load these
coalesce into that one recv(), so the commands are consumed before the shell
is up, leaving the popped shell to block forever on an empty stdin.

Fixes:
- call_shellcode: wait SHELL_SPAWN_DELAY seconds after sending the shellcode
  before sending shell commands, so the payload is the only thing in the
  target's first read() and the shell is reading by the time the commands
  arrive. This makes generated network shellcode exploits reliable in general,
  not just this test.
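- test: pick ports the OS reports as free (bind to port 0) instead of random
  ports, which may already be in use (the target sets no SO_REUSEADDR, so its
  bind() then fails). The probe socket is closed before the target binds, so
  this narrows the collision window rather than eliminating it.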
- test: wait until the target is actually listening (via psutil) before
  launching the exploit, instead of a fixed time.sleep() that races the
  server's startup under load.
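- test: raise the exploit subprocess timeout from 3 to 30 seconds, since the
  new SHELL_SPAWN_DELAY wait alone uses 2 of the original 3 seconds.
- Drop the now-unnecessary @flaky wrapper and the flaky dependency.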

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 pyproject.toml                           |  1 -
 rex/exploit/techniques/call_shellcode.py | 12 ++++++-
 tests/test_rex.py                        | 46 +++++++++++++++++++-----
 3 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8bd8645..7ff15fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
     "povsim",
     "compilerex",
     "pwntools",
-    "flaky",
 ]
 
 [tool.setuptools.package-data]
diff --git a/rex/exploit/techniques/call_shellcode.py b/rex/exploit/techniques/call_shellcode.py
index 8b893c9..4469635 100644
--- a/rex/exploit/techniques/call_shellcode.py
+++ b/rex/exploit/techniques/call_shellcode.py
@@ -4,10 +4,14 @@
 from .. import Exploit, CannotExploit
 from ..technique import Technique
 from ..nopsleds import NopSleds
-from ..actions import RexCommandAction
+from ..actions import RexCommandAction, RexWaitAction
 
 l = logging.getLogger("rex.exploit.techniques.call_shellcode")
 
+# Seconds to wait after sending the shellcode before sending any follow-up shell
+# commands, so the popped shell is reading stdin by the time the commands arrive.
+SHELL_SPAWN_DELAY = 2
+
 class CallShellcode(Technique):
 
     name = "call_shellcode"
@@ -64,6 +68,12 @@ def apply(self, cmd=None, use_nopsled=True, **kwargs): #pylint:disable=arguments
             if not cmd.endswith(b"\n"):
                 cmd += b"\n"
             channel_name = self.crash.input_type_to_channel(self.crash.input_type)
+            # Give the shellcode time to land and exec the shell before we send
+            # any commands. Otherwise the payload and the commands go out back to
+            # back, and a target that services them with a single read() can
+            # swallow both at once -- consuming the commands before the shell is
+            # up, which leaves the shell blocking forever on an empty stdin.
+            self.crash.actions.append(RexWaitAction(SHELL_SPAWN_DELAY))
             act = RexCommandAction(cmd, channel_name=channel_name)
             self.crash.actions.append(act)
             act = RexCommandAction(b"exit\n", channel_name=channel_name)
diff --git a/tests/test_rex.py b/tests/test_rex.py
index 5af0f33..9af5d79 100644
--- a/tests/test_rex.py
+++ b/tests/test_rex.py
@@ -1,6 +1,6 @@
 # pylint: disable=line-too-long
 import os
-import random
+import socket
 import subprocess
 import sys
 import tempfile
@@ -8,6 +8,7 @@
 import struct
 import logging
 
+import psutil
 import pytest
 
 import archr
@@ -17,8 +18,6 @@
 from angr.state_plugins.trace_additions import FormatInfoStrToInt, FormatInfoDontConstrain
 from rex.exploit.cgc.type1.cgc_type1_shellcode_exploit import CGCType1ShellcodeExploit
 
-from flaky import flaky
-
 bin_location = str(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../binaries'))
 cache_location = str(os.path.join(bin_location, 'tests_data/rop_gadgets_cache'))
 tests_dir = str(os.path.dirname(os.path.realpath(__file__)))
@@ -235,14 +234,42 @@ def test_linux_stacksmash_32():
         _check_arsenal_has_send(exploit.arsenal)
 
 
-@flaky(max_runs=3, min_passes=1)
+def _get_free_tcp_port():
+    """Return a TCP port that is currently free."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _is_listening_on(port):
+    """Return True if any process is listening on TCP `port`."""
+    return any(
+        conn.status == psutil.CONN_LISTEN and conn.laddr and conn.laddr.port == port
+        for conn in psutil.net_connections(kind="tcp")
+    )
+
+
+def _wait_until_listening(port, timeout=30):
+    """Block until a process is listening on TCP `port`.
+
+    We check the socket state rather than connecting, because the target
+    accept()s exactly one connection -- a probe connection would be consumed
+    instead of the exploit's.
+    """
+    deadline = time.time() + timeout
+    while not _is_listening_on(port):
+        if time.time() > deadline:
+            raise TimeoutError(f"target never started listening on port {port}")
+        time.sleep(0.05)
+
+
 def test_linux_network_stacksmash_64():
     # Test exploiting a simple network server with a stack-based buffer overflow.
     inp = b'\x00' * 500
     lib_path = os.path.join(bin_location, "tests/x86_64")
     # ld_path = os.path.join(lib_path, "ld-linux-x86-64.so.2")
     path = os.path.join(lib_path, "network_overflow")
-    port = random.randint(8000, 9000)
+    port = _get_free_tcp_port()
     with archr.targets.LocalTarget([path, str(port)], path,
                                    target_arch='x86_64',
                                    ipv4_address="127.0.0.1",
@@ -258,7 +285,7 @@ def test_linux_network_stacksmash_64():
 
         # let's actually run the exploit
 
-    new_port = random.randint(9001, 10000)
+    new_port = _get_free_tcp_port()
     with archr.targets.LocalTarget([path, str(new_port)],
                                    path,
                                    target_arch='x86_64',
@@ -267,8 +294,9 @@ def test_linux_network_stacksmash_64():
         try:
             new_target.run_command("")
 
-            # wait for the target to load
-            time.sleep(.5)
+            # wait for the target to actually be listening before we launch the
+            # exploit (the generated script connects once, with no retry)
+            _wait_until_listening(new_port)
 
             temp_script = tempfile.NamedTemporaryFile(suffix=".py", delete=False)
             exploit_location = temp_script.name
@@ -278,7 +306,7 @@ def test_linux_network_stacksmash_64():
 
             exploit_result = subprocess.check_output(["python", exploit_location,
                                                       "127.0.0.1", str(new_port),
-                                                      ], timeout=3)
+                                                      ], timeout=30)
             assert b"hello" in exploit_result
         finally:
             os.unlink(exploit_location)