Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TLS connector/acceptor benchmarks using valgrind #367

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pingora-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,27 @@ reqwest = { version = "0.11", features = ["rustls"], default-features = false }
hyperlocal = "0.8"
hyper = "0.14"
jemallocator = "0.5"
iai-callgrind = "0.13.1"
axum = { version = "0.7.5", features = ["http2"] }
axum-server = { version = "0.7.1", features = ["tls-rustls"] }

[features]
default = ["openssl"]
openssl = ["pingora-openssl"]
boringssl = ["pingora-boringssl"]
patched_http1 = []
patched_http1 = []

# Valgrind/iai-callgrind TLS benchmarks (see benches/tls_benchmarks.md).
# harness = false because iai-callgrind provides its own main() entry point.
[[bench]]
name = "tls_connector"
harness = false

# Server binary launched by the benchmarks (built via `cargo build --release --examples`)
[[example]]
name = "bench_server"


[[bench]]
name = "tls_acceptor"
harness = false

# Client binary counterpart used by the benchmarks
[[example]]
name = "bench_client"
214 changes: 214 additions & 0 deletions pingora-core/benches/tls_acceptor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
use ahash::{HashMap, HashMapExt};
use iai_callgrind::{
binary_benchmark, binary_benchmark_group, main, BinaryBenchmarkConfig, Command,
FlamegraphConfig, Tool, ValgrindTool,
};
use iai_callgrind::{Pipe, Stdin};
use reqwest::blocking::Client;
use reqwest::{Certificate, StatusCode, Version};
use std::env;
use std::fs::File;
use std::io::Read;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use tokio::task::JoinSet;

mod utils;

use utils::{generate_random_ascii_data, http_version_to_port, wait_for_tcp_connect, CERT_PATH};

// Register the benchmark group with iai-callgrind's binary-benchmark runner.
main!(binary_benchmark_groups = tls_acceptor);

binary_benchmark_group!(
    name = tls_acceptor;
    // In addition to the default callgrind instruction counts, emit
    // flamegraphs and run the DHAT heap-profiling tool.
    config = BinaryBenchmarkConfig::default()
        .flamegraph(FlamegraphConfig::default())
        .tool(Tool::new(ValgrindTool::DHAT))
        .raw_callgrind_args([""
            // NOTE: toggle values can be extracted from .out files
            // see '^fn=' values, need to be suffixed with '*' or '()'
            // grep -E '^fn=' *.out | cut -d '=' -f2- | sort -u
            //"--toggle-collect=pingora_core::services::listening::Service<A>::run_endpoint*"
            // NOTE: for usage with callgrind::start_instrumentation() & stop_instrumentation()
            //"--instr-atstart=no"
        ]);
    benchmarks = bench_server
);

// Total request volume is the same in the sequential and parallel scenarios:
// 64 requests of 64 bytes, split evenly across the acceptors when parallel.
static SEQUENTIAL_REQUEST_COUNT: i32 = 64;
static SEQUENTIAL_REQUEST_SIZE: usize = 64;
static PARALLEL_ACCEPTORS: u16 = 16;
static PARALLEL_REQUEST_COUNT: i32 = SEQUENTIAL_REQUEST_COUNT / PARALLEL_ACCEPTORS as i32;
static PARALLEL_REQUEST_SIZE: usize = 64;

// Benchmark matrix: seq (1 acceptor) | par (16 acceptors),
// HTTP/1.1 | HTTP/2, and handshake_always (new client per request) |
// handshake_once (client reused, so the TLS handshake happens only once).
// The `setup` function runs the client side in a separate process while
// the server binary below executes under valgrind.
#[binary_benchmark]
#[bench::seq_http_11_handshake_always(args = (1, Version::HTTP_11),
    setup = send_requests(1, false, Version::HTTP_11, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_11_handshake_once(args = (1, Version::HTTP_11),
    setup = send_requests(1, true, Version::HTTP_11, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_2_handshake_always(args = (1, Version::HTTP_2),
    setup = send_requests(1, false, Version::HTTP_2, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::seq_http_2_handshake_once(args = (1, Version::HTTP_2),
    setup = send_requests(1, true, Version::HTTP_2, SEQUENTIAL_REQUEST_COUNT, SEQUENTIAL_REQUEST_SIZE))]
#[bench::par_http_11_handshake_always(args = (PARALLEL_ACCEPTORS, Version::HTTP_11),
    setup = send_requests(PARALLEL_ACCEPTORS, false, Version::HTTP_11, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_11_handshake_once(args = (PARALLEL_ACCEPTORS, Version::HTTP_11),
    setup = send_requests(PARALLEL_ACCEPTORS, true, Version::HTTP_11, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_2_handshake_always(args = (PARALLEL_ACCEPTORS, Version::HTTP_2),
    setup = send_requests(PARALLEL_ACCEPTORS, false, Version::HTTP_2, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
#[bench::par_http_2_handshake_once(args = (PARALLEL_ACCEPTORS, Version::HTTP_2),
    setup = send_requests(PARALLEL_ACCEPTORS, true, Version::HTTP_2, PARALLEL_REQUEST_COUNT, PARALLEL_REQUEST_SIZE))]
fn bench_server(parallel_acceptors: u16, http_version: Version) -> Command {
    // The example binary must be pre-built in release mode
    // (`cargo build --release --examples`) before running the bench.
    let path = format!(
        "{}/../target/release/examples/bench_server",
        env!("CARGO_MANIFEST_DIR")
    );
    Command::new(path)
        // TODO: currently a workaround to keep the setup function running parallel with benchmark execution
        .stdin(Stdin::Setup(Pipe::Stderr))
        .args([
            format!("--http-version={:?}", http_version),
            format!("--parallel-acceptors={}", parallel_acceptors),
        ])
        .build()
}

/// Client side of the acceptor benchmark, run by iai-callgrind's `setup`
/// in parallel with the benchmarked server process: waits until the
/// server(s) accept TCP connections, then fires the configured number of
/// POST requests at them.
fn send_requests(
    parallel_acceptors: u16,
    client_reuse: bool,
    http_version: Version,
    request_count: i32,
    request_size: usize,
) {
    // Single-threaded runtime to keep benchmark variance low.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();

    runtime.block_on(async {
        println!("Waiting for TCP connect...");
        wait_for_tcp_connect(http_version, parallel_acceptors).await;
        println!("TCP connect successful.");

        println!("Sending benchmark requests...");
        let payloads = generate_random_ascii_data(request_count, request_size);
        tls_post_data(parallel_acceptors, client_reuse, http_version, payloads).await;
        println!("Benchmark requests successfully sent.");
        println!("Waiting for server(s) to gracefully shutdown...");
    })
}

/// Sends each element of `data` as a POST body to every one of the
/// `parallel_acceptors` servers listening on consecutive ports above the
/// version-specific base port.
///
/// With `client_reuse` one client per acceptor is created up front so the
/// TLS handshake is performed only once per acceptor; otherwise a fresh
/// client (and therefore a fresh handshake) is used for every request.
///
/// # Panics
/// Panics when `http_version` is neither HTTP/1.1 nor HTTP/2, or when any
/// spawned request task fails.
async fn tls_post_data(
    parallel_acceptors: u16,
    client_reuse: bool,
    http_version: Version,
    data: Vec<String>,
) {
    if http_version != Version::HTTP_2 && http_version != Version::HTTP_11 {
        panic!("HTTP version not supported");
    }
    let base_port = http_version_to_port(http_version);

    // Single place deciding which client flavor matches the protocol,
    // instead of repeating the HTTP_11/HTTP_2 branching at every call site.
    let new_client = |port: u16| {
        if http_version == Version::HTTP_11 {
            client_http11(port)
        } else {
            client_http2(port)
        }
    };

    // Key by the acceptor index directly — avoids a String allocation per
    // insert and per lookup compared to keying by i.to_string().
    let mut clients: HashMap<u16, Client> = HashMap::new();
    if client_reuse {
        for i in 0..parallel_acceptors {
            clients.insert(i, new_client(base_port + i));
        }
    }

    let mut req_set = JoinSet::new();
    for i in 0..parallel_acceptors {
        // spawn one request task per element within data
        for d in data.iter() {
            let client = if client_reuse {
                // reuse same connection & avoid new handshake
                // (reqwest clients are cheaply cloneable handles)
                clients.get(&i).unwrap().clone()
            } else {
                // always create a new client to ensure handshake is performed
                new_client(base_port + i)
            };
            req_set.spawn(post_data(client, http_version, base_port + i, d.to_string()));
        }
    }
    // wait for all responses
    while let Some(res) = req_set.join_next().await {
        res.unwrap();
    }
}

/// POSTs `data` to the benchmark server on `port` and checks that the
/// response comes back OK on the expected protocol version.
// using blocking client to reuse same connection in case of client_reuse=true
// async client does not ensure to use the same connection, will start a new one
// in case the existing is still blocked
async fn post_data(client: Client, version: Version, port: u16, data: String) {
    let url = format!("https://openrusty.org:{}", port);
    let resp = match client.post(url).body(data).send() {
        Ok(resp) => resp,
        Err(err) => {
            println!("HTTP client error: {err}");
            panic!("error: {err}");
        }
    };

    assert_eq!(resp.status(), StatusCode::OK);
    assert_eq!(resp.version(), version);

    // consume the full response body, important for consistent tests
    let _resp_body = resp.text().unwrap();
    // println!("resp_body: {}", resp_body)
}

/// Builds a blocking HTTP/1.1 client that resolves `openrusty.org` to
/// 127.0.0.1:`port` and trusts the benchmark CA certificate.
fn client_http11(port: u16) -> Client {
    let local = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port);
    Client::builder()
        .resolve_to_addrs("openrusty.org", &[local])
        .add_root_certificate(read_cert())
        .build()
        .unwrap()
}

/// Builds a blocking HTTP/2 client that resolves `openrusty.org` to
/// 127.0.0.1:`port` and trusts the benchmark CA certificate.
fn client_http2(port: u16) -> Client {
    let local = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port);
    Client::builder()
        .resolve_to_addrs("openrusty.org", &[local])
        .add_root_certificate(read_cert())
        // avoid error messages during first set of connections (os error 32, broken pipe)
        .http2_prior_knowledge()
        .build()
        .unwrap()
}

/// Reads the PEM certificate at `CERT_PATH` for use as a client root cert.
///
/// # Panics
/// Panics when the file cannot be read or does not contain valid PEM.
fn read_cert() -> Certificate {
    // fs::read sizes the buffer from file metadata and collapses the
    // manual File::open + read_to_end + intermediate Vec dance into one call.
    let buf = std::fs::read(CERT_PATH.to_string()).unwrap();
    Certificate::from_pem(&buf).unwrap()
}
138 changes: 138 additions & 0 deletions pingora-core/benches/tls_benchmarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# TLS Benchmarks
The benchmarks are using [Valgrind](https://valgrind.org/) through the [iai_callgrind](https://docs.rs/iai-callgrind/latest/iai_callgrind/) benchmark framework.
For measuring performance the Valgrind tool [callgrind](https://valgrind.org/docs/manual/cl-manual.html) is used.

```mermaid
C4Context
title Overview

System_Ext(ContinuousIntegration, "Continuous Integration")
System_Boundary(OS, "Linux") {
System(Cargo, "Cargo", "bench")
Container(Results, "Benchmark Results")
System_Boundary(Valgrind, "Valgrind") {
Container(LogFile, "Log File")
System_Ext(Valgrind, "Valgrind")
Container(CallGraph, "Call Graph")
Rel(Valgrind, CallGraph, "creates")
Rel(Valgrind, LogFile, "creates")
}
Rel(Cargo, Valgrind, "executes")
}

Person(Developer, "Developer")
System_Ext(QCacheGrind, "QCacheGrind", "KCacheGrind")

Rel(Developer, Cargo, "runs")
Rel(ContinuousIntegration, Cargo, "runs")

Rel(Developer, QCacheGrind, "utilizes")
Rel(QCacheGrind, CallGraph, "to visualize")

Rel(Cargo, Results, "reports")

```

## Visualization
With [kcachegrind](https://github.com/KDE/kcachegrind)/[qcachegrind](https://github.com/KDE/kcachegrind) the call-graphs
can be interactively visualized and navigated.

[gprof2dot](https://github.com/jrfonseca/gprof2dot) and [graphviz](https://graphviz.org/) can create call-graph images.

```bash
gprof2dot -f callgrind *out | dot -T png -o out.png
```

The iai_callgrind default [Flamegraphs](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.FlamegraphConfig.html#impl-Default-for-FlamegraphConfig)
are activated and stored in [SVG](https://en.wikipedia.org/wiki/SVG) format next to the call-graph files.


## Technical Details
The TLS Benchmarks are intended to capture full `connect/accept` cycles. To benchmark such a scenario it is required
to have parallel processes running (`server/client`) while only one of them is benchmarked.

### Challenges
pingora-core uses [tokio](https://tokio.rs/) as runtime and [pingora-core::server::Server](https://docs.rs/pingora-core/latest/pingora_core/server/struct.Server.html)
spawns threads when being set up.
This leads to implications on the benchmark process as multiple threads need to be covered.

As tokio is used and network requests are issued during the benchmarking the results will always have a certain variance.

To limit the variance impact, the following precautions were considered:
- running benchmarks (where possible) within a single thread and utilize tokio single-threaded runtime
- issuing multiple requests during benchmarking

### Scenario Setup
Within `pingora-core/examples/` the BinaryBenchmark Command executables for benchmarking are built using `dev-dependencies`.
The `pingora-core/benches/` contains the opposite side and the iai_callgrind definitions.

The benchmarked part (`server/client` executable) is built with `pingora-core`. The opposite part is built using
external components (`reqwest/axum`).

The `servers` are instantiated to accept `POST` requests and echo the transmitted bodies in the response.

The binaries (`bench_server/bench_client`) are launched through iai_callgrind as [BinaryBenchmark](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.BinaryBenchmark.html)
within `valgrind/callgrind`.
The BinaryBenchmark [setup](https://docs.rs/iai-callgrind/latest/iai_callgrind/struct.BinaryBenchmark.html#structfield.setup)
function is used to run the opposite part (`client/server`) of the benchmark in parallel.

For the server benchmark scenario the layout looks like:
- iai_callgrind starts the client on the setup
- the client waits for a TCP connect before issuing the requests
- iai_callgrind launches the server within valgrind
- once the server is up, the setup-function client successfully connects and starts to run the requests
- the server stops after a pre-configured period of time

```mermaid
sequenceDiagram
iai_callgrind->>Setup (Client): starts
Setup (Client)->>BinaryBenchmark (Server): TcpStream::connect
BinaryBenchmark (Server)-->>Setup (Client): Failed - Startup Phase
iai_callgrind->>BinaryBenchmark (Server): starts
Setup (Client)->>BinaryBenchmark (Server): TcpStream::connect
BinaryBenchmark (Server)->>Setup (Client): Succeeded - Server Running
Setup (Client)->>BinaryBenchmark (Server): HTTP Request
BinaryBenchmark (Server)->>Setup (Client): HTTP Response
iai_callgrind->>BinaryBenchmark (Server): waits for success
iai_callgrind->>Setup (Client): waits for success
```

For the client benchmark the setup is similar, but inverse as the server runs within the iai_callgrind setup function.

### Running
The benchmarks can be run using the following commands:
```bash
VERSION="$(cargo metadata --format-version=1 |\
jq -r '.packages[] | select(.name == "iai-callgrind").version')"
cargo install iai-callgrind-runner --version "${VERSION}"

FEATURES="openssl"
cargo build --no-default-features --features "${FEATURES}" --release --examples
cargo bench --no-default-features --features "${FEATURES}" --package pingora-core --bench tls_acceptor -- --nocapture
cargo bench --no-default-features --features "${FEATURES}" --package pingora-core --bench tls_connector -- --nocapture
```

### Output
Generated benchmark files are located below `target/iai/`:
```
target/iai/
└── pingora-core # <cargo-workspace>
└── tls_acceptor # <cargo-bench-name>
└── tls_acceptor # <iai-benchmark-group>
└── bench_server.http_11_handshake_always # <iai-benchmark-group>.<iai-benchmark-name>
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.diff.old.svg
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.old.svg
├── callgrind.bench_server.http_11_handshake_always.flamegraph.Ir.svg
├── callgrind.bench_server.http_11_handshake_always.log
├── callgrind.bench_server.http_11_handshake_always.log.old
├── callgrind.bench_server.http_11_handshake_always.out
└── callgrind.bench_server.http_11_handshake_always.out.old
```

### Parameters
Server and client benchmark are parameterized with the following options:
- number of parallel acceptors/connectors and servers/clients
- client/session re-use
- HTTP version `1.1|2.0`
- number of requests
- request body size
Loading