-
Notifications
You must be signed in to change notification settings - Fork 6
Add benchmarks #327
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add benchmarks #327
Changes from 4 commits
d873906
9c32db4
da78079
770094c
688eb99
5bebf27
0461949
778bd88
14af2d6
a61fd90
414bfe9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just "benchmarks" instead of "benchmark_tests"? |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,167 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import io | ||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
| from rdflib import Dataset, Graph | ||
|
|
||
|
|
||
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the benchmark command line options.

    Input files are passed on the command line so large datasets do not have
    to live in the repository.  The Jelly inputs are optional: when absent,
    the fixtures derive them in memory from the N-Triples / N-Quads inputs.
    """
    g = parser.getgroup("benchmark")
    g.addoption("--in-nt", type=str, help="path to N-Triples file.")
    g.addoption("--in-nq", type=str, help="path to N-Quads file.")
    g.addoption(
        "--in-jelly-triples",
        type=str,
        default=None,
        help=(
            "optional Jelly triples file; if not given, one is generated "
            "in-memory from the first --limit-statements lines of --in-nt."
        ),
    )
    g.addoption(
        "--in-jelly-quads",
        type=str,
        default=None,
        help=(
            "optional Jelly quads file; if not given, one is generated "
            "in-memory from the first --limit-statements lines of --in-nq."
        ),
    )
    g.addoption(
        "--limit-statements",
        type=int,
        default=5_000_000,
        help="use only the first N statements (lines) of each input file.",
    )
    g.addoption(
        "--warmup-rounds",
        type=int,
        default=5,
        help="warmup rounds, not counted toward the measured results.",
    )
    g.addoption("--rounds", type=int, default=10, help="measured rounds.")
    g.addoption("--iterations", type=int, default=1, help="iterations per round.")
|
|
||
|
|
||
def _slice_lines_to_bytes(path: Path, limit: int) -> bytes:
    """Return the first *limit* lines of *path* as raw bytes.

    N-Triples / N-Quads are line-oriented (one statement per line), so taking
    a line prefix yields a valid document with at most *limit* statements.
    Reading stops as soon as the limit is reached, so the rest of a large
    file is never touched.
    """
    chunks: list[bytes] = []
    with path.open("rb") as src:
        for count, raw_line in enumerate(src, start=1):
            chunks.append(raw_line)
            if count >= limit:
                break
    return b"".join(chunks)
|
|
||
|
|
||
@pytest.fixture(scope="session")
def limit_statements(request: pytest.FixtureRequest) -> int:
    """Maximum number of input statements to benchmark (``--limit-statements``)."""
    return int(request.config.getoption("--limit-statements"))


@pytest.fixture(scope="session")
def pedantic_cfg(request: pytest.FixtureRequest) -> dict[str, int]:
    """Keyword arguments for ``benchmark.pedantic`` built from the CLI options."""
    option = request.config.getoption
    return {
        "warmup_rounds": int(option("--warmup-rounds")),
        "rounds": int(option("--rounds")),
        "iterations": int(option("--iterations")),
    }
|
|
||
|
|
||
@pytest.fixture(scope="session")
def nt_path(request: pytest.FixtureRequest) -> Path:
    """Path to the N-Triples input file given via ``--in-nt``.

    Raises pytest.UsageError (instead of ``assert``, which ``python -O``
    strips) when the option is missing or the file does not exist.
    """
    opt = request.config.getoption("--in-nt")
    if not opt:
        raise pytest.UsageError("--in-nt is required")
    p = Path(opt)
    if not p.exists():
        raise pytest.UsageError(f"--in-nt not found: {p}")
    return p


@pytest.fixture(scope="session")
def nq_path(request: pytest.FixtureRequest) -> Path:
    """Path to the N-Quads input file given via ``--in-nq``.

    Raises pytest.UsageError when the option is missing or the file does
    not exist.
    """
    opt = request.config.getoption("--in-nq")
    if not opt:
        raise pytest.UsageError("--in-nq is required")
    p = Path(opt)
    if not p.exists():
        raise pytest.UsageError(f"--in-nq not found: {p}")
    return p
|
|
||
|
|
||
@pytest.fixture(scope="session")
def jelly_triples_path(request: pytest.FixtureRequest) -> Path | None:
    """Optional ``--in-jelly-triples`` path; None means "generate in-memory"."""
    raw = request.config.getoption("--in-jelly-triples")
    if not raw:
        return None
    return Path(raw)


@pytest.fixture(scope="session")
def jelly_quads_path(request: pytest.FixtureRequest) -> Path | None:
    """Optional ``--in-jelly-quads`` path; None means "generate in-memory"."""
    raw = request.config.getoption("--in-jelly-quads")
    if not raw:
        return None
    return Path(raw)
|
|
||
|
|
||
@pytest.fixture(scope="session")
def nt_bytes_sliced(nt_path: Path, limit_statements: int) -> bytes:
    """First ``--limit-statements`` lines of the N-Triples file, as bytes."""
    return _slice_lines_to_bytes(nt_path, limit_statements)


@pytest.fixture(scope="session")
def nq_bytes_sliced(nq_path: Path, limit_statements: int) -> bytes:
    """First ``--limit-statements`` lines of the N-Quads file, as bytes."""
    return _slice_lines_to_bytes(nq_path, limit_statements)
|
|
||
|
|
||
@pytest.fixture(scope="session")
def nt_graph(nt_bytes_sliced: bytes) -> Graph:
    """rdflib Graph parsed once per session from the sliced N-Triples bytes."""
    graph = Graph()
    graph.parse(data=nt_bytes_sliced, format="nt")
    return graph


@pytest.fixture(scope="session")
def nq_dataset(nq_bytes_sliced: bytes) -> Dataset:
    """rdflib Dataset parsed once per session from the sliced N-Quads bytes."""
    dataset = Dataset()
    dataset.parse(data=nq_bytes_sliced, format="nquads")
    return dataset
|
|
||
|
|
||
@pytest.fixture(scope="session")
def jelly_triples_bytes(jelly_triples_path: Path | None, nt_graph: Graph) -> bytes:
    """Jelly triples payload: read from disk when provided, else serialized
    in-memory from the already-parsed NT graph."""
    if jelly_triples_path and jelly_triples_path.exists():
        return jelly_triples_path.read_bytes()
    # No pre-built file supplied: encode the session graph on the fly.
    return nt_graph.serialize(destination=None, format="jelly", encoding="utf-8")


@pytest.fixture(scope="session")
def jelly_quads_bytes(jelly_quads_path: Path | None, nq_dataset: Dataset) -> bytes:
    """Jelly quads payload: read from disk when provided, else serialized
    in-memory from the already-parsed NQ dataset."""
    if jelly_quads_path and jelly_quads_path.exists():
        return jelly_quads_path.read_bytes()
    # No pre-built file supplied: encode the session dataset on the fly.
    return nq_dataset.serialize(destination=None, format="jelly", encoding="utf-8")
|
|
||
|
|
||
def pytest_configure(config: pytest.Config) -> None:
    """Declare the custom markers used by the benchmark suite."""
    marker_lines = (
        "benchmark: flat ser/des benchmarks",
        "triples: triples-only benchmarks (NT/Jelly-triples)",
        "quads: quads-only benchmarks (NQ/Jelly-quads)",
    )
    for line in marker_lines:
        config.addinivalue_line("markers", line)
|
|
||
|
|
||
def pytest_collection_modifyitems(
    config: pytest.Config, items: list[pytest.Item]
) -> None:
    """Deselect triples/quads benchmarks whose input file was not provided.

    A test marked ``triples`` needs ``--in-nt``; one marked ``quads`` needs
    ``--in-nq``.  Anything else is left untouched.
    """
    has_nt = bool(config.getoption("--in-nt"))
    has_nq = bool(config.getoption("--in-nq"))

    selected: list[pytest.Item] = []
    deselected: list[pytest.Item] = []

    for item in items:
        needs_nt = item.get_closest_marker("triples") is not None
        needs_nq = item.get_closest_marker("quads") is not None
        if (needs_nt and not has_nt) or (needs_nq and not has_nq):
            deselected.append(item)
        else:
            selected.append(item)

    if deselected:
        config.hook.pytest_deselected(items=deselected)
    items[:] = selected
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import io | ||
| from contextlib import suppress | ||
|
|
||
| from rdflib import Dataset, Graph | ||
|
|
||
| from tests.utils.benchmark_io_utils import NullCounter | ||
|
|
||
|
|
||
def parse_nt_bytes(nt_bytes: bytes) -> Graph:
    """Parse N-Triples bytes into a fresh rdflib Graph."""
    graph = Graph()
    graph.parse(data=nt_bytes, format="nt")
    return graph


def parse_jelly_triples_bytes(jelly_bytes: bytes) -> Graph:
    """Parse Jelly-encoded triples into a fresh rdflib Graph."""
    graph = Graph()
    graph.parse(data=jelly_bytes, format="jelly")
    return graph


def parse_nq_bytes(nq_bytes: bytes) -> Dataset:
    """Parse N-Quads bytes into a fresh rdflib Dataset."""
    dataset = Dataset()
    dataset.parse(data=nq_bytes, format="nquads")
    return dataset


def parse_jelly_quads_bytes(jelly_bytes: bytes) -> Dataset:
    """Parse Jelly-encoded quads into a fresh rdflib Dataset."""
    dataset = Dataset()
    dataset.parse(data=jelly_bytes, format="jelly")
    return dataset
|
|
||
|
|
||
def _serialize_to_null_sink(store: Graph | Dataset, fmt: str) -> int:
    """Serialize *store* in format *fmt* into a byte-counting null sink.

    Writing into a NullCounter instead of a real file keeps disk I/O out of
    the measured serialization time.  Returns the number of bytes produced.
    """
    sink = NullCounter()
    buf = io.BufferedWriter(sink)
    store.serialize(destination=buf, format=fmt, encoding="utf-8")
    buf.flush()
    # detach() may be unsupported by the buffer or already impossible after
    # the serializer closed it; either way the counter has seen every byte.
    with suppress(io.UnsupportedOperation, ValueError):
        buf.detach()
    return sink.n


def serialize_nt_stream(g: Graph) -> int:
    """Serialize a Graph as N-Triples; return the byte count.

    Fixed: this previously used format="nquads", which contradicts the
    function name and is wrong for a plain (context-less) Graph.
    """
    return _serialize_to_null_sink(g, "nt")


def serialize_jelly_triples_stream(g: Graph) -> int:
    """Serialize a Graph as Jelly; return the byte count."""
    return _serialize_to_null_sink(g, "jelly")


def serialize_nq_stream(ds: Dataset) -> int:
    """Serialize a Dataset as N-Quads; return the byte count."""
    return _serialize_to_null_sink(ds, "nquads")


def serialize_jelly_quads_stream(ds: Dataset) -> int:
    """Serialize a Dataset as Jelly; return the byte count."""
    return _serialize_to_null_sink(ds, "jelly")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,67 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import pytest | ||
| from pytest_benchmark.fixture import BenchmarkFixture # type: ignore[import-not-found] | ||
|
|
||
| from tests.benchmark_tests.jelly_rdflib import ( | ||
| parse_jelly_quads_bytes, | ||
| parse_jelly_triples_bytes, | ||
| parse_nq_bytes, | ||
| parse_nt_bytes, | ||
| ) | ||
| from tests.utils.benchmark_throughput import print_throughput | ||
|
|
||
# Skip this whole module when pytest-benchmark is not installed, instead of
# failing at collection time with an ImportError.
pytest.importorskip(
    "pytest_benchmark",
    reason="Install bench dependency group and run with -m benchmark",
)

# Every test in this module carries the "benchmark" marker.
pytestmark = pytest.mark.benchmark
|
|
||
|
|
||
@pytest.mark.triples
def test_flat_triples_deserialize_nt(
    benchmark: BenchmarkFixture,
    nt_bytes_sliced: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark rdflib deserialization of the sliced N-Triples input."""
    payload = (nt_bytes_sliced,)
    benchmark.pedantic(parse_nt_bytes, args=payload, **pedantic_cfg)
    print_throughput(benchmark, limit_statements, "triples: parse NT")
|
|
||
|
|
||
@pytest.mark.triples
def test_flat_triples_deserialize_jelly(
    benchmark: BenchmarkFixture,
    jelly_triples_bytes: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark rdflib deserialization of the Jelly triples payload."""
    payload = (jelly_triples_bytes,)
    benchmark.pedantic(parse_jelly_triples_bytes, args=payload, **pedantic_cfg)
    print_throughput(benchmark, limit_statements, "triples: parse Jelly")
|
|
||
|
|
||
@pytest.mark.quads
def test_flat_quads_deserialize_nq(
    benchmark: BenchmarkFixture,
    nq_bytes_sliced: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark rdflib deserialization of the sliced N-Quads input."""
    payload = (nq_bytes_sliced,)
    benchmark.pedantic(parse_nq_bytes, args=payload, **pedantic_cfg)
    print_throughput(benchmark, limit_statements, "quads: parse NQ")
|
|
||
|
|
||
@pytest.mark.quads
def test_flat_quads_deserialize_jelly(
    benchmark: BenchmarkFixture,
    jelly_quads_bytes: bytes,
    pedantic_cfg: dict[str, int],
    limit_statements: int,
) -> None:
    """Benchmark rdflib deserialization of the Jelly quads payload."""
    payload = (jelly_quads_bytes,)
    benchmark.pedantic(parse_jelly_quads_bytes, args=payload, **pedantic_cfg)
    print_throughput(benchmark, limit_statements, "quads: parse Jelly")
Uh oh!
There was an error while loading. Please reload this page.