From 5fd404e65a22c97f570ae286fb884fa26d4a6e95 Mon Sep 17 00:00:00 2001 From: Ostrzyciel Date: Sat, 12 Jul 2025 18:11:31 +0200 Subject: [PATCH] rdf to-jelly: preserve blank node IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, blank node IDs were reassigned to new ones drawn from a hashed in-memory pool. This unnecessarily wasted resources (we can keep the original IDs; they are fine for our use case), made a mess in the output, and could lead to out-of-memory errors (OOMs) on very large files. Preserving the original IDs also somewhat speeds up the conversion of files that contain blank nodes. For OSM data, I saw ~10% better throughput when converting Turtle to Jelly. Conversion is still not amazingly fast, mostly due to the Turtle parser, but I'm hesitant to mess with it further – we may break something important. --- .../jelly/cli/command/rdf/RdfToJelly.scala | 6 +++++- .../cli/command/rdf/RdfToJellySpec.scala | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala index 65f65ac..7fcde62 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala @@ -10,6 +10,7 @@ import eu.neverblink.jelly.convert.jena.JenaConverterFactory import eu.neverblink.jelly.convert.jena.riot.{JellyFormatVariant, JellyLanguage, JellyStreamWriter} import eu.neverblink.jelly.core.proto.google.v1 as google import eu.neverblink.jelly.core.proto.v1.{LogicalStreamType, PhysicalStreamType, RdfStreamOptions} +import org.apache.jena.riot.lang.LabelToNode import org.apache.jena.riot.system.StreamRDFWriter import org.apache.jena.riot.{Lang, RDFParser, RIOT} @@ -159,7 +160,10 @@ object RdfToJelly extends RdfSerDesCommand[RdfToJellyOptions, RdfFormat.Readable .build() JellyStreamWriter(JenaConverterFactory.getInstance(), 
variant, outputStream) - RDFParser.source(inputStream).lang(jenaLang).parse(jellyWriter) + RDFParser.source(inputStream) + .lang(jenaLang) + .labelToNode(LabelToNode.createUseLabelAsGiven()) + .parse(jellyWriter) jellyWriter.finish() /** Convert Jelly text to Jelly binary. diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala index 129b2c0..c6cf447 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala @@ -78,6 +78,25 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: content.containsAll(tripleModel.listStatements()) shouldBe true } + "preserve the original blank node IDs" in { + val inputString = + """_:b1 <http://example.org/p> _:b2 . + |_:b1 <http://example.org/p> _:b3 . + |""".stripMargin + val input = ByteArrayInputStream(inputString.getBytes) + RdfToJelly.setStdIn(input) + val (out, err) = RdfToJelly.runTestCommand( + List("rdf", "to-jelly", "--in-format", RdfFormat.NQuads.cliOptions.head), + ) + val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes) + val content = translateJellyBack(newIn) + content.size() should be(2) + val statements = content.listStatements().asScala.toSeq + statements.flatMap(s => Seq(s.getSubject, s.getObject)).toSet + .map(_.asResource().getId.toString) + .toSet should be(Set("b1", "b2", "b3")) + } + "input stream to output stream, generalized RDF (N-Triples)" in { val inputStream = new FileInputStream(getClass.getResource("/generalized.nt").getPath) RdfToJelly.setStdIn(inputStream)