perf: copy entire file before unpacking #1002

Open · wants to merge 4 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -31,3 +31,5 @@ test-data/channels/conda-forge/

# Visual studio files
.vs/

__pycache__/
6 changes: 6 additions & 0 deletions crates/rattler_package_streaming/Cargo.toml
@@ -48,3 +48,9 @@ walkdir = { workspace = true }
rstest = { workspace = true }
rstest_reuse = { workspace = true }
insta = { workspace = true, features = ["yaml"] }
criterion = { workspace = true, features = ["html_reports", "async_tokio"] }

[[bench]]
name = "extract"
harness = false

66 changes: 66 additions & 0 deletions crates/rattler_package_streaming/benches/extract.rs
@@ -0,0 +1,66 @@
use std::time::Duration;

use rattler_package_streaming::{reqwest::tokio::extract_tar_bz2, ExtractError, ExtractResult};
use reqwest::Client;
use reqwest_middleware::ClientWithMiddleware;
use tempfile::TempDir;
use url::Url;

use criterion::{criterion_group, criterion_main, Criterion};

enum TarUrls {
    Python,
    Bat,
    Boltons,
}

(CI annotation — GitHub Actions / Format and Lint, line 12: variants `Python` and `Boltons` are never constructed)

impl TarUrls {
    fn get_url(&self) -> Url {
        match self {
            TarUrls::Python => Url::parse("http://localhost:8008/python").unwrap(),
            TarUrls::Bat => Url::parse("http://localhost:8008/bat").unwrap(),
            TarUrls::Boltons => Url::parse("http://localhost:8008/boltons").unwrap(),
        }
    }
}

async fn extract_tar(url: TarUrls) -> Result<ExtractResult, ExtractError> {
    // Create a new temporary directory for this run
    let temp_dir = TempDir::new().expect("Failed to create temporary directory");
    let temp_path = temp_dir.path();

    // Build a client without connection reuse
    let client = Client::builder()
        .pool_max_idle_per_host(0) // Disable connection pooling
        .build()
        .expect("Failed to build reqwest client");

    extract_tar_bz2(
        ClientWithMiddleware::from(client),
        url.get_url(),
        temp_path,
        None,
        None,
    )
    .await
}

/// Before running the benchmark
/// you need to start the server by running the following command:
/// `pixi run run-server`
fn criterion_benchmark(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();

    c.bench_function("extract tars", |b| {
        b.to_async(&rt).iter(|| async {
            extract_tar(TarUrls::Bat).await.unwrap();
        });
    });
}

criterion_group!(
    name = benches;
    config = Criterion::default().measurement_time(Duration::from_secs(210)).sample_size(10);
    targets = criterion_benchmark
);
criterion_main!(benches);
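The benchmark above only exercises the `bat` package, which is why CI flags the `Python` and `Boltons` variants as dead code. A sketch (not part of this PR) of how the benchmark could cover all three packages, reusing `extract_tar` and `TarUrls` from the file above:

// Hypothetical extension, not included in the PR: one Criterion benchmark per
// package so the remaining TarUrls variants are exercised as well.
fn criterion_benchmark_all(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();

    c.bench_function("extract bat", |b| {
        b.to_async(&rt).iter(|| async {
            extract_tar(TarUrls::Bat).await.unwrap();
        });
    });
    c.bench_function("extract boltons", |b| {
        b.to_async(&rt).iter(|| async {
            extract_tar(TarUrls::Boltons).await.unwrap();
        });
    });
    c.bench_function("extract python", |b| {
        b.to_async(&rt).iter(|| async {
            extract_tar(TarUrls::Python).await.unwrap();
        });
    });
}

Registering such a function in `criterion_group!` would produce one report per package instead of a single "extract tars" entry.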
16 changes: 16 additions & 0 deletions crates/rattler_package_streaming/benches/pixi.toml
@@ -0,0 +1,16 @@
[project]
authors = ["nichmor <[email protected]>"]
channels = ["conda-forge"]
description = "Add a short description here"
name = "benches"
platforms = ["osx-arm64"]
version = "0.1.0"

[tasks]
run-server = "uvicorn webserver:app --host 0.0.0.0 --port 8008"

[dependencies]
fastapi = ">=0.115.6,<0.116"
requests = ">=2.32.3,<3"
python = ">=3.13.1,<3.14"
uvicorn = ">=0.34.0,<0.35"
58 changes: 58 additions & 0 deletions crates/rattler_package_streaming/benches/webserver.py
@@ -0,0 +1,58 @@
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import requests
import io

# Path to the Conda files
PYTHON_FILE_URL = "https://conda.anaconda.org/conda-forge/win-64/python-3.11.0-hcf16a7b_0_cpython.tar.bz2"
BOLTONS_FILE_URL = "https://repo.prefix.dev/conda-forge/noarch/boltons-21.0.0-pyhd8ed1ab_0.tar.bz2"
BAT_FILE_URL = "https://repo.prefix.dev/conda-forge/win-64/bat-0.22.1-h7f3b576_0.tar.bz2"

app = FastAPI()

# In-memory storage for files
file_cache = {}

async def fetch_conda_file(url: str) -> io.BytesIO:
    """Fetch the Conda file and return it as an in-memory BytesIO object."""
    response = requests.get(url, stream=True)
    response.raise_for_status()

    memory_file = io.BytesIO()
    for chunk in response.iter_content(chunk_size=8192):
        memory_file.write(chunk)
    memory_file.seek(0)
    return memory_file

@app.on_event("startup")
async def load_files():
    """Download Conda files at server startup."""
    global file_cache
    print("Downloading Conda files...")
    file_cache["python"] = await fetch_conda_file(PYTHON_FILE_URL)
    file_cache["boltons"] = await fetch_conda_file(BOLTONS_FILE_URL)
    file_cache["bat"] = await fetch_conda_file(BAT_FILE_URL)
    print("Conda files downloaded and stored in memory.")


@app.get("/{file_name}")
async def serve_file(file_name: str):
    """Serve the requested file from in-memory storage."""
    if file_name not in file_cache:
        raise HTTPException(status_code=404, detail="File not found")

    # Retrieve the file and reset the pointer
    memory_file = file_cache[file_name]
    memory_file.seek(0)  # Reset the pointer to the start of the file

    return StreamingResponse(
        memory_file,
        media_type="application/octet-stream",
        headers={"Content-Disposition": f"attachment; filename={file_name}.tar.bz2"},
    )


@app.get("/")
def list_files():
    """List available files."""
    return {"available_files": list(file_cache.keys())}
9 changes: 8 additions & 1 deletion crates/rattler_package_streaming/src/read.rs
@@ -36,8 +36,15 @@ pub fn extract_tar_bz2(
     let mut md5_reader =
         rattler_digest::HashingReader::<_, rattler_digest::Md5>::new(sha256_reader);
 
+    // Create a SpooledTempFile with a 5MB limit and copy the contents of the reader into it.
+    // Based on benchmarks this is faster than unpacking while downloading.
+    let mut temp_file = SpooledTempFile::new(5 * 1024 * 1024);
+
+    copy(&mut md5_reader, &mut temp_file)?;
+    temp_file.seek(SeekFrom::Start(0))?;
+
     // Unpack the archive
-    stream_tar_bz2(&mut md5_reader).unpack(destination)?;
+    stream_tar_bz2(&mut temp_file).unpack(destination)?;
 
     // Get the hashes
     let (sha256_reader, md5) = md5_reader.finalize();
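The hunk above shows only part of `extract_tar_bz2` and omits the imports (`SpooledTempFile`, `copy`, `Seek`) it relies on. A minimal, self-contained sketch of the same buffering pattern — assuming the `tempfile`, `bzip2`, and `tar` crates, and not the actual rattler implementation:

use std::io::{copy, Read, Seek, SeekFrom};
use std::path::Path;

use tempfile::SpooledTempFile;

// Sketch of the pattern this PR applies: buffer the whole compressed archive
// locally (in memory up to 5 MiB, spilling to disk beyond that), rewind, and
// only then decompress and unpack, instead of unpacking straight from the
// network stream.
fn buffer_then_unpack(mut reader: impl Read, destination: &Path) -> std::io::Result<()> {
    let mut temp_file = SpooledTempFile::new(5 * 1024 * 1024);
    copy(&mut reader, &mut temp_file)?;
    temp_file.seek(SeekFrom::Start(0))?;

    // Decompress and unpack from the local buffer.
    let decoder = bzip2::read::BzDecoder::new(temp_file);
    tar::Archive::new(decoder).unpack(destination)?;
    Ok(())
}

The trade-off is extra temporary storage for the compressed archive in exchange for decompressing and unpacking from a fast local source rather than at the pace of the network stream, which is what the benchmark above is meant to measure.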