Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ docs = [
"markupsafe<4.0.0",
"linkchecker~=10.6.0",
]
bench = [
"pytest-benchmark>=5.2.1",
"rdflib>=7.1.4",
]

[tool.uv]
required-version = "~=0.9.0"
Expand All @@ -110,7 +114,14 @@ extra-standard-library = ["typing_extensions"]

[tool.pytest.ini_options]
pythonpath = "."
addopts = ["--import-mode=importlib", "--doctest-modules", "--ignore-glob=docs/examples/**", "--ignore-glob=examples/**"]
addopts = ["--import-mode=importlib",
"--doctest-modules",
"--ignore-glob=docs/examples/**",
"--ignore-glob=examples/**",
"--ignore-glob=tests/utils/benchmark_*",
"--ignore=tests/benchmark_tests",
"-m", "not benchmark"
]

[tool.ruff]
extend-exclude = ["*{_pb2,_pb2_grpc}.{py,pyi}"]
Expand Down
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just "benchmarks" instead of "benchmark_tests"?

Empty file.
167 changes: 167 additions & 0 deletions tests/benchmark_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from __future__ import annotations

import io
from pathlib import Path

import pytest
from rdflib import Dataset, Graph


# Register the benchmark command-line options with pytest.
# Every option below is added to the "benchmark" option group.
def pytest_addoption(parser: pytest.Parser) -> None:
g = parser.getgroup("benchmark")
# Text inputs. No default is declared for either option here; the nt_path /
# nq_path fixtures in this file assert that the value is set when requested.
g.addoption("--in-nt", type=str, help="path to N-Triples file.")
g.addoption("--in-nq", type=str, help="path to N-Quads file.")
# Optional pre-built Jelly inputs. When unset, the jelly_*_bytes fixtures in
# this file serialize the parsed text input to Jelly instead.
g.addoption(
"--in-jelly-triples",
type=str,
default=None,
help="optional Jelly triples file; if none, generated in-memory from nt file.",
)
# "nq slice" in the help text below refers to the first --limit-statements
# lines of the --in-nq file (see the nq_bytes_sliced fixture in this file).
g.addoption(
"--in-jelly-quads",
type=str,
default=None,
help="optional Jelly quads file; if none, generated in-memory from nq slice.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is an "nq slice"?

)

# Maximum number of input lines read from the text inputs
# (consumed by the limit_statements fixture).
g.addoption(
"--limit-statements",
type=int,
default=5_000_000,
help="first N statements from input.",
)
# The three options below are collected by the pedantic_cfg fixture and
# forwarded as keyword arguments to benchmark.pedantic(...).
g.addoption(
"--warmup-rounds",
type=int,
default=5,
help="warmup rounds, not counted to evaluation.",
)
g.addoption("--rounds", type=int, default=10, help="measured rounds.")
g.addoption("--iterations", type=int, default=1, help="iterations per round.")


# Copy the leading lines of the file at `path` into memory and return them as
# one bytes object. The file is opened in binary mode, so lines are copied
# verbatim (no decoding).
#
# path:  file to read from.
# limit: number of lines after which copying stops.
#
# NOTE: each line is written before the limit is checked, so a non-empty file
# always contributes at least its first line, even when limit <= 0.
def _slice_lines_to_bytes(path: Path, limit: int) -> bytes:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are no comments here or anywhere else, again. This makes the code rather hard to review.

Please make this code readable, and then I will review it again.

buf = io.BytesIO()
with path.open("rb") as f:
for i, line in enumerate(f):
buf.write(line)
# i is zero-based, so i + 1 is the number of lines copied so far.
if i + 1 >= limit:
break
return buf.getvalue()


@pytest.fixture(scope="session")
def limit_statements(request: pytest.FixtureRequest) -> int:
    """Return the ``--limit-statements`` command-line value as an ``int``."""
    raw_limit = request.config.getoption("--limit-statements")
    return int(raw_limit)


@pytest.fixture(scope="session")
def pedantic_cfg(request: pytest.FixtureRequest) -> dict[str, int]:
    """Collect the round settings passed to ``benchmark.pedantic``.

    Returns:
        A dict with the keys ``warmup_rounds``, ``rounds`` and ``iterations``,
        each holding the ``int`` value of the matching command-line option.
    """
    read_option = request.config.getoption
    # (result key, command-line flag) pairs, in the order they are read.
    key_and_flag = (
        ("warmup_rounds", "--warmup-rounds"),
        ("rounds", "--rounds"),
        ("iterations", "--iterations"),
    )
    return {key: int(read_option(flag)) for key, flag in key_and_flag}


@pytest.fixture(scope="session")
def nt_path(request: pytest.FixtureRequest) -> Path:
    """Return the ``--in-nt`` option as a ``Path`` to an existing file.

    Raises:
        AssertionError: if the option is missing/empty or the path does not
            exist.
    """
    raw_value = request.config.getoption("--in-nt")
    assert raw_value, "--in-nt is required"

    nt_file = Path(raw_value)
    assert nt_file.exists(), f"--in-nt not found: {nt_file}"
    return nt_file


@pytest.fixture(scope="session")
def nq_path(request: pytest.FixtureRequest) -> Path:
    """Return the ``--in-nq`` option as a ``Path`` to an existing file.

    Raises:
        AssertionError: if the option is missing/empty or the path does not
            exist.
    """
    raw_value = request.config.getoption("--in-nq")
    assert raw_value, "--in-nq is required"

    nq_file = Path(raw_value)
    assert nq_file.exists(), f"--in-nq not found: {nq_file}"
    return nq_file


@pytest.fixture(scope="session")
def jelly_triples_path(request: pytest.FixtureRequest) -> Path | None:
    """Return ``--in-jelly-triples`` as a ``Path``, or ``None`` when unset/empty."""
    raw_value = request.config.getoption("--in-jelly-triples")
    if not raw_value:
        return None
    return Path(raw_value)


@pytest.fixture(scope="session")
def jelly_quads_path(request: pytest.FixtureRequest) -> Path | None:
    """Return ``--in-jelly-quads`` as a ``Path``, or ``None`` when unset/empty."""
    raw_value = request.config.getoption("--in-jelly-quads")
    if not raw_value:
        return None
    return Path(raw_value)


@pytest.fixture(scope="session")
def nt_bytes_sliced(nt_path: Path, limit_statements: int) -> bytes:
    """Return the leading lines of the N-Triples input file as raw bytes.

    The number of lines is capped by the ``limit_statements`` fixture.
    """
    return _slice_lines_to_bytes(path=nt_path, limit=limit_statements)


@pytest.fixture(scope="session")
def nq_bytes_sliced(nq_path: Path, limit_statements: int) -> bytes:
    """Return the leading lines of the N-Quads input file as raw bytes.

    The number of lines is capped by the ``limit_statements`` fixture.
    """
    return _slice_lines_to_bytes(path=nq_path, limit=limit_statements)


# Session-scoped fixture: parse the sliced N-Triples bytes (nt_bytes_sliced)
# into a fresh rdflib Graph and return that graph. The jelly_triples_bytes
# fixture in this file serializes this graph when no Jelly file is supplied.
@pytest.fixture(scope="session")
def nt_graph(nt_bytes_sliced: bytes) -> Graph:
g = Graph()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you even use Graph? For buffering in-memory you must use an array of statements, otherwise you will get nonsensical results. Same with Dataset, of course.

# "nt" selects rdflib's N-Triples parser for the in-memory bytes.
g.parse(data=nt_bytes_sliced, format="nt")
return g


@pytest.fixture(scope="session")
def nq_dataset(nq_bytes_sliced: bytes) -> Dataset:
    """Parse the sliced N-Quads bytes into a fresh rdflib ``Dataset``."""
    dataset = Dataset()
    # The return value of parse() is ignored; the populated dataset itself is
    # what the fixture hands out.
    dataset.parse(format="nquads", data=nq_bytes_sliced)
    return dataset


@pytest.fixture(scope="session")
def jelly_triples_bytes(jelly_triples_path: Path | None, nt_graph: Graph) -> bytes:
    """Return the Jelly-encoded triples used as deserialization input.

    The bytes are read from ``jelly_triples_path`` when it is set and exists.
    Otherwise (including when a path was given but does not exist) they are
    produced by serializing ``nt_graph`` to Jelly in memory.
    """
    if not jelly_triples_path or not jelly_triples_path.exists():
        return nt_graph.serialize(destination=None, format="jelly", encoding="utf-8")
    return jelly_triples_path.read_bytes()


@pytest.fixture(scope="session")
def jelly_quads_bytes(jelly_quads_path: Path | None, nq_dataset: Dataset) -> bytes:
    """Return the Jelly-encoded quads used as deserialization input.

    The bytes are read from ``jelly_quads_path`` when it is set and exists.
    Otherwise (including when a path was given but does not exist) they are
    produced by serializing ``nq_dataset`` to Jelly in memory.
    """
    if not jelly_quads_path or not jelly_quads_path.exists():
        return nq_dataset.serialize(destination=None, format="jelly", encoding="utf-8")
    return jelly_quads_path.read_bytes()


def pytest_configure(config: pytest.Config) -> None:
    """Register the custom markers used by the benchmark tests.

    Adds one ``markers`` ini line each for ``benchmark``, ``triples`` and
    ``quads``, in that order.
    """
    marker_lines = (
        "benchmark: flat ser/des benchmarks",
        "triples: triples-only benchmarks (NT/Jelly-triples)",
        "quads: quads-only benchmarks (NQ/Jelly-quads)",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)


def pytest_collection_modifyitems(
    config: pytest.Config, items: list[pytest.Item]
) -> None:
    """Deselect marked benchmarks whose input file option was not supplied.

    Items marked ``triples`` are dropped when ``--in-nt`` is unset or empty,
    and items marked ``quads`` are dropped when ``--in-nq`` is unset or empty.
    Dropped items are reported through the ``pytest_deselected`` hook and
    ``items`` is updated in place. When nothing is dropped, neither happens.
    """
    nt_given = bool(config.getoption("--in-nt"))
    nq_given = bool(config.getoption("--in-nq"))

    def lacks_input(item: pytest.Item) -> bool:
        # True when the item needs an input file that was not provided.
        wants_triples = item.get_closest_marker("triples") is not None
        wants_quads = item.get_closest_marker("quads") is not None
        return (wants_triples and not nt_given) or (wants_quads and not nq_given)

    # Evaluate each item exactly once, then split while preserving order.
    drop_flags = [lacks_input(item) for item in items]
    dropped = [item for item, drop in zip(items, drop_flags) if drop]
    if not dropped:
        return

    kept = [item for item, drop in zip(items, drop_flags) if not drop]
    config.hook.pytest_deselected(items=dropped)
    items[:] = kept
72 changes: 72 additions & 0 deletions tests/benchmark_tests/jelly_rdflib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from __future__ import annotations

import io
from contextlib import suppress

from rdflib import Dataset, Graph

from tests.utils.benchmark_io_utils import NullCounter


def parse_nt_bytes(nt_bytes: bytes) -> Graph:
    """Parse N-Triples bytes into a new rdflib ``Graph`` and return it."""
    graph = Graph()
    graph.parse(format="nt", data=nt_bytes)
    return graph


def parse_jelly_triples_bytes(jelly_bytes: bytes) -> Graph:
    """Parse Jelly-encoded bytes into a new rdflib ``Graph`` and return it."""
    graph = Graph()
    graph.parse(format="jelly", data=jelly_bytes)
    return graph


def parse_nq_bytes(nq_bytes: bytes) -> Dataset:
    """Parse N-Quads bytes into a new rdflib ``Dataset`` and return it."""
    dataset = Dataset()
    # parse()'s own return value is not used; the populated dataset is returned.
    dataset.parse(format="nquads", data=nq_bytes)
    return dataset


def parse_jelly_quads_bytes(jelly_bytes: bytes) -> Dataset:
    """Parse Jelly-encoded bytes into a new rdflib ``Dataset`` and return it."""
    dataset = Dataset()
    # parse()'s own return value is not used; the populated dataset is returned.
    dataset.parse(format="jelly", data=jelly_bytes)
    return dataset


def serialize_nt_stream(g: Graph) -> int:
    """Serialize ``g`` as N-Triples into a discarding sink.

    The output is streamed through a ``BufferedWriter`` wrapped around a
    ``NullCounter``, so nothing is kept in memory or written to disk.

    Args:
        g: graph to serialize.

    Returns:
        The sink's ``n`` counter after the writer has been flushed
        (presumably the number of bytes written; see ``NullCounter``).
    """
    sink = NullCounter()
    buf = io.BufferedWriter(sink)
    # A plain Graph holds triples, so it is written with the N-Triples
    # serializer ("nt"); "nquads" is the quad format used for Datasets in
    # serialize_nq_stream.
    g.serialize(destination=buf, format="nt", encoding="utf-8")
    buf.flush()
    # Best-effort detach of the sink from the writer; errors from a writer
    # that cannot be detached are deliberately ignored.
    with suppress(io.UnsupportedOperation, ValueError):
        buf.detach()
    return sink.n


def serialize_jelly_triples_stream(g: Graph) -> int:
    """Serialize ``g`` as Jelly into a discarding sink.

    Returns:
        The sink's ``n`` counter after the writer has been flushed
        (presumably the number of bytes written; see ``NullCounter``).
    """
    counter = NullCounter()
    writer = io.BufferedWriter(counter)

    g.serialize(destination=writer, format="jelly", encoding="utf-8")
    writer.flush()

    # Best-effort detach; a writer that cannot be detached is left as is.
    with suppress(io.UnsupportedOperation, ValueError):
        writer.detach()

    written = counter.n
    return written


def serialize_nq_stream(ds: Dataset) -> int:
    """Serialize ``ds`` as N-Quads into a discarding sink.

    Returns:
        The sink's ``n`` counter after the writer has been flushed
        (presumably the number of bytes written; see ``NullCounter``).
    """
    counter = NullCounter()
    writer = io.BufferedWriter(counter)

    ds.serialize(destination=writer, format="nquads", encoding="utf-8")
    writer.flush()

    # Best-effort detach; a writer that cannot be detached is left as is.
    with suppress(io.UnsupportedOperation, ValueError):
        writer.detach()

    written = counter.n
    return written


def serialize_jelly_quads_stream(ds: Dataset) -> int:
    """Serialize ``ds`` as Jelly into a discarding sink.

    Returns:
        The sink's ``n`` counter after the writer has been flushed
        (presumably the number of bytes written; see ``NullCounter``).
    """
    counter = NullCounter()
    writer = io.BufferedWriter(counter)

    ds.serialize(destination=writer, format="jelly", encoding="utf-8")
    writer.flush()

    # Best-effort detach; a writer that cannot be detached is left as is.
    with suppress(io.UnsupportedOperation, ValueError):
        writer.detach()

    written = counter.n
    return written
67 changes: 67 additions & 0 deletions tests/benchmark_tests/test_flat_deserialize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from tests.benchmark_tests.jelly_rdflib import (
    parse_jelly_quads_bytes,
    parse_jelly_triples_bytes,
    parse_nq_bytes,
    parse_nt_bytes,
)
from tests.utils.benchmark_throughput import print_throughput

if TYPE_CHECKING:
    # BenchmarkFixture is used only in annotations, which are never evaluated
    # at runtime thanks to ``from __future__ import annotations``. Importing it
    # eagerly would raise ImportError before the importorskip below gets a
    # chance to skip the module when pytest-benchmark is not installed.
    from pytest_benchmark.fixture import BenchmarkFixture  # type: ignore[import-not-found]

# Skip this whole module (instead of failing collection) when the optional
# pytest-benchmark plugin is missing.
pytest.importorskip(
    "pytest_benchmark",
    reason="Install bench dependency group and run with -m benchmark",
)

# Apply the ``benchmark`` marker to every test in this module.
pytestmark = pytest.mark.benchmark


# Benchmark parse_nt_bytes on the sliced N-Triples input, then report
# throughput under the label "triples: parse NT".
# NOTE: parse_nt_bytes (tests/benchmark_tests/jelly_rdflib.py) parses into an
# rdflib Graph, so the timed call includes building that graph.
@pytest.mark.triples
def test_flat_triples_deserialize_nt(
benchmark: BenchmarkFixture,
nt_bytes_sliced: bytes,
pedantic_cfg: dict[str, int],
limit_statements: int,
) -> None:
# pedantic_cfg supplies the warmup_rounds / rounds / iterations keyword
# arguments (see the pedantic_cfg fixture in conftest.py).
benchmark.pedantic(parse_nt_bytes, args=(nt_bytes_sliced,), **pedantic_cfg)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are measuring here not the parsing speed, but the speed with which rdflib can insert stuff into the Graph. This is meaningless. You must only iterate over the resulting triples/quads, nothing else.

# limit_statements is passed as the statement count for the throughput report.
print_throughput(benchmark, limit_statements, "triples: parse NT")


@pytest.mark.triples
def test_flat_triples_deserialize_jelly(
    benchmark: BenchmarkFixture,
    jelly_triples_bytes: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark ``parse_jelly_triples_bytes`` and report its throughput.

    ``pedantic_cfg`` provides the warmup/round/iteration keyword arguments for
    ``benchmark.pedantic``; ``limit_statements`` is passed on to
    ``print_throughput`` together with the label for this benchmark.
    """
    call_args = (jelly_triples_bytes,)
    benchmark.pedantic(parse_jelly_triples_bytes, args=call_args, **pedantic_cfg)

    label = "triples: parse Jelly"
    print_throughput(benchmark, limit_statements, label)


@pytest.mark.quads
def test_flat_quads_deserialize_nq(
    benchmark: BenchmarkFixture,
    nq_bytes_sliced: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark ``parse_nq_bytes`` and report its throughput.

    ``pedantic_cfg`` provides the warmup/round/iteration keyword arguments for
    ``benchmark.pedantic``; ``limit_statements`` is passed on to
    ``print_throughput`` together with the label for this benchmark.
    """
    call_args = (nq_bytes_sliced,)
    benchmark.pedantic(parse_nq_bytes, args=call_args, **pedantic_cfg)

    label = "quads: parse NQ"
    print_throughput(benchmark, limit_statements, label)


@pytest.mark.quads
def test_flat_quads_deserialize_jelly(
    benchmark: BenchmarkFixture,
    jelly_quads_bytes: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark ``parse_jelly_quads_bytes`` and report its throughput.

    ``pedantic_cfg`` provides the warmup/round/iteration keyword arguments for
    ``benchmark.pedantic``; ``limit_statements`` is passed on to
    ``print_throughput`` together with the label for this benchmark.
    """
    call_args = (jelly_quads_bytes,)
    benchmark.pedantic(parse_jelly_quads_bytes, args=call_args, **pedantic_cfg)

    label = "quads: parse Jelly"
    print_throughput(benchmark, limit_statements, label)
Loading
Loading