Python: Set up read only file system backed by packages tar (#1555)
This is a startup performance optimization. Instead of copying the data around,
this uses the tar file in place as the backing for the files.
hoodmane authored Jan 23, 2024
1 parent 249c080 commit f4f29e8
Showing 15 changed files with 345 additions and 80 deletions.
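
The gist of the change, as a rough sketch (not code from this commit): once a file's offset and size inside the archive are known, a read can be answered with a view into the tar's backing bytes instead of an extracted copy. The names `tarBytes` and `entry` below are hypothetical.

// Minimal sketch: serve a read-only file as a view into the tar bytes.
// `tarBytes` (the embedded archive as a Uint8Array) and `entry` (a parsed
// header record with `contentsOffset` and `size`) are assumed names.
function openReadOnly(tarBytes, entry) {
  // subarray aliases the tar's underlying buffer -- no data is copied
  return tarBytes.subarray(
    entry.contentsOffset,
    entry.contentsOffset + entry.size
  );
}
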
5 changes: 4 additions & 1 deletion build/capnp_embed.bzl
@@ -14,6 +14,7 @@ def _capnp_embed_impl(ctx):
_capnp_embed = rule(
attrs = {
"src": attr.label(allow_single_file = True),
"deps": attr.label_list(),
},
implementation = _capnp_embed_impl
)
@@ -23,6 +24,7 @@ def capnp_embed(
src,
visibility = None,
target_compatible_with = None,
deps = [],
):
"""
Bazel rule to include `src` in a Cap'n Proto search path for embedding.
@@ -34,7 +36,8 @@
name = name + "_gen",
src = src,
visibility = visibility,
target_compatible_with = target_compatible_with
target_compatible_with = target_compatible_with,
deps = deps
)
native.cc_library(
name = name
74 changes: 58 additions & 16 deletions src/pyodide/BUILD.bazel
@@ -1,7 +1,42 @@
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
load("@bazel_skylib//rules:copy_directory.bzl", "copy_directory")
load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
load("@workerd//:build/wd_js_bundle.bzl", "wd_js_bundle")
load("@capnp-cpp//src/capnp:cc_capnp_library.bzl", "cc_capnp_library")
load("//:build/capnp_embed.bzl", "capnp_embed")
load("//:build/wd_js_bundle.bzl", "wd_js_bundle")

copy_file(
name = "pyodide_packages_archive",
src = "@pyodide_packages//:pyodide_packages_unzipped_0.2.tar",
out = "generated/pyodide_packages_unzipped_0.1.tar",
)

capnp_embed(
name = "pyodide_packages_archive_embed",
src = "generated/pyodide_packages_unzipped_0.1.tar",
deps = ["pyodide_packages_archive"],
)

copy_file(
name = "packages_capnp_file",
src = "pyodide_packages.capnp",
out = "generated/pyodide_packages.capnp",
)

capnp_embed(
name = "packages_capnp_file_embed",
src = "generated/pyodide_packages.capnp",
deps = ["packages_capnp_file"],
)

cc_capnp_library(
name = "pyodide_packages_capnp",
srcs = ["generated/pyodide_packages.capnp"],
visibility = ["//visibility:public"],
deps = [
":packages_capnp_file_embed",
":pyodide_packages_archive_embed",
],
)

copy_file(
name = "pyodide.asm.wasm@rule",
@@ -15,38 +50,46 @@ copy_file(
out = "generated/python_stdlib.zip",
)

copy_file(
name = "pyodide_packages_file.zip@rule",
src = "@pyodide_packages//:pyodide_packages_unzipped_0.2.tar",
out = "generated/pyodide_packages_unzipped_0.2.tar",
)

PRELUDE = """
import { newWasmModule, monotonicDateNow, wasmInstantiate } from "pyodide-internal:builtin_wrappers";
function addEventListener(){}
"""

REPLACEMENTS = [
["var _createPyodideModule", PRELUDE + "export const _createPyodideModule"],
["globalThis._createPyodideModule = _createPyodideModule;", ""],
["new WebAssembly.Module", "newWasmModule"],
["WebAssembly.instantiate", "wasmInstantiate"],
["Date.now", "monotonicDateNow"],
[
"var _createPyodideModule",
PRELUDE + "export const _createPyodideModule",
],
[
"globalThis._createPyodideModule = _createPyodideModule;",
"",
],
[
"new WebAssembly.Module",
"newWasmModule",
],
[
"WebAssembly.instantiate",
"wasmInstantiate",
],
[
"Date.now",
"monotonicDateNow",
],
]

expand_template(
name = "pyodide.asm.js@rule",
out = "generated/pyodide.asm.js",
template = "@pyodide//:pyodide/pyodide.asm.js",
substitutions = dict(REPLACEMENTS),
template = "@pyodide//:pyodide/pyodide.asm.js",
)

wd_js_bundle(
name = "pyodide",
builtin_modules = [
"python.js",
"generated/pyodide_packages_unzipped_0.2.tar"
] + glob(["internal/patches/*.py"]),
import_name = "pyodide",
internal_data_modules = ["generated/python_stdlib.zip"],
@@ -61,6 +104,5 @@ wd_js_bundle(
"pyodide.asm.js@rule",
"pyodide.asm.wasm@rule",
"python_stdlib.zip@rule",
"pyodide_packages_file.zip@rule"
],
)
1 change: 1 addition & 0 deletions src/pyodide/README.md
@@ -29,6 +29,7 @@ The present approach is just the fastest way to get something working.
## What's happening here?

Pyodide's distribution consists of:

1. The main "emscripten binary" which is `pyodide.asm.js` and `pyodide.asm.wasm`
2. A loader `pyodide.js`
3. The Python + Pyodide stdlib `python_stdlib.zip`
2 changes: 1 addition & 1 deletion src/pyodide/internal/builtin_wrappers.js
@@ -110,5 +110,5 @@ export async function wasmInstantiate(module, imports) {
module = UnsafeEval.newWasmModule(module);
}
const instance = new WebAssembly.Instance(module, imports);
return {module, instance};
return { module, instance };
}
37 changes: 9 additions & 28 deletions src/pyodide/internal/pyodide-bootstrap.js
@@ -2,12 +2,11 @@ import { loadPyodide } from "pyodide:python";
import { getMetadata } from "pyodide:current-bundle";
import { lockFile } from "pyodide:package-lock.json";
import { getPatches } from "pyodide:patches";
import embeddedPackages from "pyodide:embedded_packages";

function initializePackageIndex(pyodide, lockfile) {
if (!lockfile.packages) {
throw new Error(
"Loaded pyodide lock file does not contain the expected key 'packages'.",
"Loaded pyodide lock file does not contain the expected key 'packages'."
);
}
const API = pyodide._api;
@@ -83,7 +82,7 @@ const EMBEDDED_PYTHON_PACKAGES = [
"jsonpointer",
"mypy_extensions",
"micropip",
"packaging"
"packaging",
];

function transformMetadata(metadata) {
@@ -104,18 +103,18 @@ function transformMetadata(metadata) {
metadata.globals.push({
name: module.name,
value: {
pythonModule: module.pythonModule
}
})
pythonModule: module.pythonModule,
},
});
}

if (module.pythonRequirement !== undefined) {
metadata.globals.push({
name: module.name,
value: {
pythonRequirement: module.pythonRequirement
}
})
pythonRequirement: module.pythonRequirement,
},
});
}
}
return metadata;
@@ -150,22 +149,6 @@ export default {
}

if (hasRequirements) {
const name = "pyodide_packages_unzipped_0.2.tar";
const path = `/lib/python3.11/site-packages/${name}`;
pyodide.FS.writeFile(path, new Uint8Array(embeddedPackages), {
encoding: 'binary',
});

pyodide.runPython(`
import tarfile
import os
tar_file_path = "${path}"
containing_dir = os.path.dirname(tar_file_path)
with tarfile.open(tar_file_path, 'r') as tar:
tar.extractall(containing_dir)
`)

const micropip = pyodide.pyimport("micropip");
if (micropipRequirements.length > 0) {
// Micropip and ssl packages are contained in the tarball which is extracted above. This means
@@ -175,13 +158,11 @@

// Apply patches that enable some packages to work.
const patches = getPatches();
// TODO(EW-8055): Why does micropip.list not work?
if (JSON.parse(micropip.freeze())["packages"]["aiohttp"] !== undefined) {
if (micropip.list().has("aiohttp") !== undefined) {
pyodide.runPython(patches["aiohttp_fetch_patch.py"]);
}
}


await pyodide.loadPackage(pythonRequirements);

return await pyodide.pyimport(metadata.mainModule).fetch(request);
124 changes: 124 additions & 0 deletions src/pyodide/internal/tar.js
@@ -0,0 +1,124 @@
import { default as Reader } from "pyodide-internal:packages_tar_reader";

// This is based on the info about the tar file format on Wikipedia,
// plus some trial and error with real tar files.
// https://en.wikipedia.org/wiki/Tar_(computing)#File_format

const decoder = new TextDecoder();
function decodeString(buf) {
const nullIdx = buf.indexOf(0);
if (nullIdx >= 0) {
buf = buf.subarray(0, nullIdx);
}
return decoder.decode(buf);
}
function decodeField(buf, offset, size) {
return decodeString(buf.subarray(offset, offset + size));
}
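// Numeric fields in a tar header are stored as ASCII octal strings, hence radix 8.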
function decodeNumber(buf, offset, size) {
return parseInt(decodeField(buf, offset, size), 8);
}

function decodeHeader(buf) {
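// Field offsets follow the ustar header layout: name at byte 0, mode at
// 100, size at 124, mtime at 136, typeflag at 156, name prefix at 345.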
const nameBase = decodeField(buf, 0, 100);
const namePrefix = decodeField(buf, 345, 155);
let path = namePrefix + nameBase;
// Trim possible leading ./
if (path.startsWith("./")) {
path = path.slice(2);
}
const mode = decodeNumber(buf, 100, 8);
const size = decodeNumber(buf, 124, 12);
const modtime = decodeNumber(buf, 136, 12);
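// typeflag (byte 156) is an ASCII digit in ustar archives: "0" is a regular file, "5" a directory.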
const type = Number(String.fromCharCode(buf[156]));
return {
path,
name: path,
mode,
size,
modtime,
type,
parts: [],
children: undefined,
};
}

export function parseTarInfo() {
const directories = [];
const soFiles = [];
const root = {
children: new Map(),
mode: 0o777,
type: 5,
modtime: 0,
size: 0,
path: "",
name: "",
parts: [],
};
let directory = root;
const buf = new Uint8Array(512);
let offset = 0;
while (true) {
Reader.read(offset, buf);
const info = decodeHeader(buf);
if (isNaN(info.mode)) {
// The archive ends in blocks of zeros; an all-zero header parses to NaN mode, so we're done
return [root, soFiles];
}
const contentsOffset = offset + 512;
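// Advance past this 512-byte header plus the contents, rounded up to whole 512-byte blocks.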
offset += 512 * Math.ceil(info.size / 512 + 1);
if (info.path === "") {
// skip possible leading ./ directory
continue;
}
if (info.path.includes("PaxHeader")) {
// Ignore the PaxHeader extension.
// These metadata directories don't have a real directory entry, which
// would cause us to crash below.
// Our tar files shouldn't contain these anyway...
continue;
}

// Navigate to the correct directory by going up until we're at the common
// ancestor of the current position and the target then back down.
//
// Most tar files I run into are lexicographically sorted, so the "go back
// down" step is not necessary. But some tar files are a bit out of order.
//
// We do rely on the fact that the entry for a given directory appears
// before any files in the directory. I don't see anywhere in the spec where
// it says this is required but I think it would be weird and annoying for a
// tar file to violate this property.
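//
// For example, after descending into "a/b/", an entry "a/c/mod.py" first
// pops back up to "a/" (the common ancestor), then descends into "a/c/".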

// go up to common ancestor
while (directories.length && !info.name.startsWith(directory.path)) {
directory = directories.pop();
}
// go down to target (in many tar files this second loop body is evaluated 0
// times)
const parts = info.path.slice(0, -1).split("/");
for (let i = directories.length; i < parts.length - 1; i++) {
directories.push(directory);
directory = directory.children.get(parts[i]);
}
if (info.type === 5) {
// a directory
directories.push(directory);
info.parts = parts;
info.name = info.parts.at(-1);
info.children = new Map();
directory.children.set(info.name, info);
directory = info;
} else {
// hopefully a normal file; we ignore other type values (e.g., symlinks)
info.contentsOffset = contentsOffset;
info.name = info.path.slice(directory.path.length);
if (info.name.endsWith(".so")) {
soFiles.push(info.path);
}
directory.children.set(info.name, info);
}
}
}
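
As a usage sketch (not part of this commit): a regular file's bytes can then be served straight from the archive using the `contentsOffset` recorded above, relying on `Reader.read(offset, buf)` filling `buf` from the given byte offset, as in the loop above.

// Sketch: read a file's contents directly out of the backing tar.
// `info` is a file entry produced by parseTarInfo.
function readFileContents(info) {
  const contents = new Uint8Array(info.size);
  // No extraction step: the tar itself backs the read-only file system.
  Reader.read(info.contentsOffset, contents);
  return contents;
}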