Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TLS connector/acceptor benchmarks using valgrind #367

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pingora-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,27 @@ reqwest = { version = "0.11", features = ["rustls"], default-features = false }
hyperlocal = "0.8"
hyper = "0.14"
jemallocator = "0.5"
iai-callgrind = "0.13.1"
axum = { version = "0.7.5", features = ["http2"] }
axum-server = { version = "0.7.1", features = ["tls-rustls"] }

[features]
default = ["openssl"]
openssl = ["pingora-openssl"]
boringssl = ["pingora-boringssl"]
patched_http1 = []
patched_http1 = []

# Valgrind/iai-callgrind TLS benchmarks (see benches/tls_benchmarks.md).
# harness = false because iai-callgrind provides its own main() entry point.
[[bench]]
name = "tls_connector"
harness = false

# Server binary launched by the benchmarks (built via `cargo build --release --examples`)
[[example]]
name = "bench_server"


[[bench]]
name = "tls_acceptor"
harness = false

# Client binary counterpart used by the benchmarks
[[example]]
name = "bench_client"
214 changes: 214 additions & 0 deletions pingora-core/benches/tls_acceptor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
use ahash::{HashMap, HashMapExt};
use iai_callgrind::{
binary_benchmark, binary_benchmark_group, main, BinaryBenchmarkConfig, Command,
FlamegraphConfig, Tool, ValgrindTool,
};
use iai_callgrind::{Pipe, Stdin};
use reqwest::blocking::Client;
use reqwest::{Certificate, StatusCode, Version};
use std::env;
use std::fs::File;
use std::io::Read;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use tokio::task::JoinSet;

mod utils;

use utils::{generate_random_ascii_data, http_version_to_port, wait_for_tcp_connect, CERT_PATH};

// Register the benchmark group with iai-callgrind's binary-benchmark runner.
main!(binary_benchmark_groups = tls_acceptor);

binary_benchmark_group!(
    name = tls_acceptor;
    // In addition to the default callgrind instruction counts, emit
    // flamegraphs and run the DHAT heap-profiling tool.
    config = BinaryBenchmarkConfig::default()
        .flamegraph(FlamegraphConfig::default())
        .tool(Tool::new(ValgrindTool::DHAT))
        .raw_callgrind_args([""
            // NOTE: toggle values can be extracted from .out files
            // see '^fn=' values, need to be suffixed with '*' or '()'
            // grep -E '^fn=' *.out | cut -d '=' -f2- | sort -u
            //"--toggle-collect=pingora_core::services::listening::Service<A>::run_endpoint*"
            // NOTE: for usage with callgrind::start_instrumentation() & stop_instrumentation()
            //"--instr-atstart=no"
        ]);
    benchmarks = bench_server
);

// Total request volume is the same in the sequential and parallel scenarios:
// 64 requests of 64 bytes, split evenly across the acceptors when parallel.
static SEQUENTIAL_REQUEST_COUNT: i32 = 64;
static SEQUENTIAL_REQUEST_SIZE: usize = 64;
static PARALLEL_ACCEPTORS: u16 = 16;
static PARALLEL_REQUEST_COUNT: i32 = SEQUENTIAL_REQUEST_COUNT / PARALLEL_ACCEPTORS as i32;
static PARALLEL_REQUEST_SIZE: usize = 64;

// Benchmark matrix: seq (1 acceptor) | par (16 acceptors),
// HTTP/1.1 | HTTP/2, and handshake_always (new client per request) |
// handshake_once (client reused, so the TLS handshake happens only once).
// The `setup` function runs the client side in a separate process while
// the server binary below executes under valgrind.
#[binary_benchmark]
#[bench::seq_http_11_handshake_always(args = (1, Version::HTTP_11),
    setup = send_requests(1, false, Version::HTTP_11, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_11_handshake_once(args = (1, Version::HTTP_11),
    setup = send_requests(1, true, Version::HTTP_11, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_2_handshake_always(args = (1, Version::HTTP_2),
    setup = send_requests(1, false, Version::HTTP_2, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_2_handshake_once(args = (1, Version::HTTP_2),
    setup = send_requests(1, true, Version::HTTP_2, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::par_http_11_handshake_always(args = (PARALLEL_ACCEPTORS, Version::HTTP_11),
    setup = send_requests(PARALLEL_ACCEPTORS, false, Version::HTTP_11, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_11_handshake_once(args = (PARALLEL_ACCEPTORS, Version::HTTP_11),
    setup = send_requests(PARALLEL_ACCEPTORS, true, Version::HTTP_11, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_2_handshake_always(args = (PARALLEL_ACCEPTORS, Version::HTTP_2),
    setup = send_requests(PARALLEL_ACCEPTORS, false, Version::HTTP_2, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_2_handshake_once(args = (PARALLEL_ACCEPTORS, Version::HTTP_2),
    setup = send_requests(PARALLEL_ACCEPTORS, true, Version::HTTP_2, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
fn bench_server(parallel_acceptors: u16, http_version: Version) -> Command {
    // The example binary must be pre-built in release mode
    // (`cargo build --release --examples`) before running the bench.
    let path = format!(
        "{}/../target/release/examples/bench_server",
        env!("CARGO_MANIFEST_DIR")
    );
    Command::new(path)
        // TODO: currently a workaround to keep the setup function running parallel with benchmark execution
        .stdin(Stdin::Setup(Pipe::Stderr))
        .args([
            format!("--http-version={:?}", http_version),
            format!("--parallel-acceptors={}", parallel_acceptors),
        ])
        .build()
}

/// Client side of the acceptor benchmark, run by iai-callgrind's `setup`
/// in parallel with the benchmarked server process: waits until the
/// server(s) accept TCP connections, then fires the configured number of
/// POST requests at them.
fn send_requests(
    parallel_acceptors: u16,
    client_reuse: bool,
    http_version: Version,
    request_count: i32,
    request_size: usize,
) {
    // Single-threaded runtime to keep benchmark variance low.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();

    runtime.block_on(async {
        println!("Waiting for TCP connect...");
        wait_for_tcp_connect(http_version, parallel_acceptors).await;
        println!("TCP connect successful.");

        println!("Sending benchmark requests...");
        let payloads = generate_random_ascii_data(request_count, request_size);
        tls_post_data(parallel_acceptors, client_reuse, http_version, payloads).await;
        println!("Benchmark requests successfully sent.");
        println!("Waiting for server(s) to gracefully shutdown...");
    })
}

/// Sends each element of `data` as a POST body to every one of the
/// `parallel_acceptors` servers listening on consecutive ports above the
/// version-specific base port.
///
/// With `client_reuse` one client per acceptor is created up front so the
/// TLS handshake is performed only once per acceptor; otherwise a fresh
/// client (and therefore a fresh handshake) is used for every request.
///
/// # Panics
/// Panics when `http_version` is neither HTTP/1.1 nor HTTP/2, or when any
/// spawned request task fails.
async fn tls_post_data(
    parallel_acceptors: u16,
    client_reuse: bool,
    http_version: Version,
    data: Vec<String>,
) {
    if http_version != Version::HTTP_2 && http_version != Version::HTTP_11 {
        panic!("HTTP version not supported");
    }
    let base_port = http_version_to_port(http_version);

    // Single place deciding which client flavor matches the protocol,
    // instead of repeating the HTTP_11/HTTP_2 branching at every call site.
    let new_client = |port: u16| {
        if http_version == Version::HTTP_11 {
            client_http11(port)
        } else {
            client_http2(port)
        }
    };

    // Key by the acceptor index directly — avoids a String allocation per
    // insert and per lookup compared to keying by i.to_string().
    let mut clients: HashMap<u16, Client> = HashMap::new();
    if client_reuse {
        for i in 0..parallel_acceptors {
            clients.insert(i, new_client(base_port + i));
        }
    }

    let mut req_set = JoinSet::new();
    for i in 0..parallel_acceptors {
        // spawn one request task per element within data
        for d in data.iter() {
            let client = if client_reuse {
                // reuse same connection & avoid new handshake
                // (reqwest clients are cheaply cloneable handles)
                clients.get(&i).unwrap().clone()
            } else {
                // always create a new client to ensure handshake is performed
                new_client(base_port + i)
            };
            req_set.spawn(post_data(client, http_version, base_port + i, d.to_string()));
        }
    }
    // wait for all responses
    while let Some(res) = req_set.join_next().await {
        res.unwrap();
    }
}

/// POSTs `data` to the benchmark server on `port` and checks that the
/// response comes back OK on the expected protocol version.
// using blocking client to reuse same connection in case of client_reuse=true
// async client does not ensure to use the same connection, will start a new one
// in case the existing is still blocked
async fn post_data(client: Client, version: Version, port: u16, data: String) {
    let url = format!("https://openrusty.org:{}", port);
    let resp = match client.post(url).body(data).send() {
        Ok(resp) => resp,
        Err(err) => {
            println!("HTTP client error: {err}");
            panic!("error: {err}");
        }
    };

    assert_eq!(resp.status(), StatusCode::OK);
    assert_eq!(resp.version(), version);

    // consume the full response body, important for consistent tests
    let _resp_body = resp.text().unwrap();
    // println!("resp_body: {}", resp_body)
}

/// Builds a blocking HTTP/1.1 client that resolves `openrusty.org` to
/// 127.0.0.1:`port` and trusts the benchmark CA certificate.
fn client_http11(port: u16) -> Client {
    let local = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port);
    Client::builder()
        .resolve_to_addrs("openrusty.org", &[local])
        .add_root_certificate(read_cert())
        .build()
        .unwrap()
}

/// Builds a blocking HTTP/2 client that resolves `openrusty.org` to
/// 127.0.0.1:`port` and trusts the benchmark CA certificate.
fn client_http2(port: u16) -> Client {
    let local = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port);
    Client::builder()
        .resolve_to_addrs("openrusty.org", &[local])
        .add_root_certificate(read_cert())
        // avoid error messages during first set of connections (os error 32, broken pipe)
        .http2_prior_knowledge()
        .build()
        .unwrap()
}

/// Reads the PEM certificate at `CERT_PATH` for use as a client root cert.
///
/// # Panics
/// Panics when the file cannot be read or does not contain valid PEM.
fn read_cert() -> Certificate {
    // fs::read sizes the buffer from file metadata and collapses the
    // manual File::open + read_to_end + intermediate Vec dance into one call.
    let buf = std::fs::read(CERT_PATH.to_string()).unwrap();
    Certificate::from_pem(&buf).unwrap()
}
138 changes: 138 additions & 0 deletions pingora-core/benches/tls_benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# TLS Benchmarks
The benchmarks are using [Valgrind](https://valgrind.org/) through the [iai_callgrind](https://docs.rs/iai-callgrind/latest/iai_callgrind/) benchmark framework.
For measuring performance the Valgrind tool [callgrind](https://valgrind.org/docs/manual/cl-manual.html) is used.

```mermaid
C4Context
title Overview

System_Ext(ContinuousIntegration, "Continuous Integration")
System_Boundary(OS, "Linux") {
System(Cargo, "Cargo", "bench")
Container(Results, "Benchmark Results")
System_Boundary(Valgrind, "Valgrind") {
Container(LogFile, "Log File")
System_Ext(Valgrind, "Valgrind")
Container(CallGraph, "Call Graph")
Rel(Valgrind, CallGraph, "creates")
Rel(Valgrind, LogFile, "creates")
}
Rel(Cargo, Valgrind, "executes")
}

Person(Developer, "Developer")
System_Ext(QCacheGrind, "QCacheGrind", "KCacheGrind")

Rel(Developer, Cargo, "runs")
Rel(ContinuousIntegration, Cargo, "runs")

Rel(Developer, QCacheGrind, "utilizes")
Rel(QCacheGrind, CallGraph, "to visualize")

Rel(Cargo, Results, "reports")

```

## Visualization
With [kcachegrind](https://github.com/KDE/kcachegrind)/[qcachegrind](https://github.com/KDE/kcachegrind) the call-graphs
can be interactively visualized and navigated.

[gprof2dot](https://github.com/jrfonseca/gprof2dot) and [graphviz](https://graphviz.org/) can create call-graph images.

```bash
gprof2dot -f callgrind *out | dot -T png -o out.png
```

The iai_callgrind default [Flamegraphs](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.FlamegraphConfig.html#impl-Default-for-FlamegraphConfig)
are activated and stored in [SVG](https://en.wikipedia.org/wiki/SVG) format next to the call-graph files.


## Technical Details
The TLS Benchmarks are intended to capture full `connect/accept` cycles. To benchmark such a scenario it is required
to have parallel processes running (`server/client`) while only one of them is benchmarked.

### Challenges
pingora-core uses [tokio](https://tokio.rs/) as runtime and [pingora-core::server::Server](https://docs.rs/pingora-core/latest/pingora_core/server/struct.Server.html)
spawns threads when being set up.
This leads to implications on the benchmark process as multiple threads need to be covered.

As tokio is used and network requests are issued during the benchmarking the results will always have a certain variance.

To limit the variance impact, the following precautions were considered:
- running benchmarks (where possible) within a single thread and utilize tokio single-threaded runtime
- issuing multiple requests during benchmarking

### Scenario Setup
Within `pingora-core/examples/` the BinaryBenchmark Command executables for benchmarking are built using `dev-dependencies`.
The `pingora-core/benches/` contains the opposite side and the iai_callgrind definitions.

The benchmarked part (`server/client` executable) is built with `pingora-core`. The opposite part is built using
external components (`reqwest/axum`).

The `servers` are instantiated to accept `POST` requests and echo the transmitted bodies in the response.

The binaries (`bench_server/bench_client`) are launched through iai_callgrind as [BinaryBenchmark](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.BinaryBenchmark.html)
within `valgrind/callgrind`.
The BinaryBenchmark [setup](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.BinaryBenchmark.html#structfield.setup)
function is used to run the opposite part (`client/server`) of the benchmark in parallel.

For the server benchmark scenario the layout looks like:
- iai_callgrind starts the client on the setup
- the client waits for a TCP connect before issuing the requests
- iai_callgrind launches the server within valgrind
- once the server is up, the setup-function client successfully connects and starts to run the requests
- the server stops after a pre-configured period of time

```mermaid
sequenceDiagram
iai_callgrind->>Setup (Client): starts
Setup (Client)->>BinaryBenchmark (Server): TcpStream::connect
BinaryBenchmark (Server)-->>Setup (Client): Failed - Startup Phase
iai_callgrind->>BinaryBenchmark (Server): starts
Setup (Client)->>BinaryBenchmark (Server): TcpStream::connect
BinaryBenchmark (Server)->>Setup (Client): Succeeded - Server Running
Setup (Client)->>BinaryBenchmark (Server): HTTP Request
BinaryBenchmark (Server)->>Setup (Client): HTTP Response
iai_callgrind->>BinaryBenchmark (Server): waits for success
iai_callgrind->>Setup (Client): waits for success
```

For the client benchmark the setup is similar, but inverse as the server runs within the iai_callgrind setup function.

### Running
The benchmarks can be run using the following commands:
```bash
VERSION="$(cargo metadata --format-version=1 |\
jq -r '.packages[] | select(.name == "iai-callgrind").version')"
cargo install iai-callgrind-runner --version "${VERSION}"

FEATURES="openssl"
cargo build --no-default-features --features "${FEATURES}" --release --examples
cargo bench --no-default-features --features "${FEATURES}" --package pingora-core --bench tls_acceptor -- --nocapture
cargo bench --no-default-features --features "${FEATURES}" --package pingora-core --bench tls_connector -- --nocapture
```

### Output
Generated benchmark files are located below `target/iai/`:
```
target/iai/
└── pingora-core # <cargo-workspace>
└── tls_acceptor # <cargo-bench-name>
└── tls_acceptor # <iai-benchmark-group>
└── bench_server.http_11_handshake_always # <iai-benchmark-group>.<iai-benchmark-name>
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.diff.old.svg
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.old.svg
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.svg
├── callgrind.bench_server.http_11_handshake_always.log
├── callgrind.bench_server.http_11_handshake_always.log.old
├── callgrind.bench_server.http_11_handshake_always.out
└── callgrind.bench_server.http_11_handshake_always.out.old
```

### Parameters
Server and client benchmark are parameterized with the following options:
- number of parallel acceptors/connectors and servers/clients
- client/session re-use
- HTTP version `1.1|2.0`
- number of requests
- request body size
Loading