From 5fd404e65a22c97f570ae286fb884fa26d4a6e95 Mon Sep 17 00:00:00 2001
From: Ostrzyciel <ostrzycielnozyczek@gmail.com>
Date: Sat, 12 Jul 2025 18:11:31 +0200
Subject: [PATCH] rdf to-jelly: preserve blank node IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, blank node IDs were reassigned to new ones drawn from a hashed in-memory pool. This unnecessarily wasted resources (we can keep the original IDs; they are fine for our use case), made a mess in the output, and could potentially lead to OOMs for very large files.

This somewhat speeds up the conversion for files that contain blank nodes. For OSM data, I saw ~10% better throughput in converting Turtle to Jelly.

It's still not amazingly fast, mostly due to the Turtle parser, but I'm hesitant to mess with it further – we may break something important.
---
 .../jelly/cli/command/rdf/RdfToJelly.scala    |  6 +++++-
 .../cli/command/rdf/RdfToJellySpec.scala      | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala
index 65f65ac..7fcde62 100644
--- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala
+++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala
@@ -10,6 +10,7 @@ import eu.neverblink.jelly.convert.jena.JenaConverterFactory
 import eu.neverblink.jelly.convert.jena.riot.{JellyFormatVariant, JellyLanguage, JellyStreamWriter}
 import eu.neverblink.jelly.core.proto.google.v1 as google
 import eu.neverblink.jelly.core.proto.v1.{LogicalStreamType, PhysicalStreamType, RdfStreamOptions}
+import org.apache.jena.riot.lang.LabelToNode
 import org.apache.jena.riot.system.StreamRDFWriter
 import org.apache.jena.riot.{Lang, RDFParser, RIOT}
 
@@ -159,7 +160,10 @@ object RdfToJelly extends RdfSerDesCommand[RdfToJellyOptions, RdfFormat.Readable
             .build()
           JellyStreamWriter(JenaConverterFactory.getInstance(), variant, outputStream)
 
-    RDFParser.source(inputStream).lang(jenaLang).parse(jellyWriter)
+    RDFParser.source(inputStream)
+      .lang(jenaLang)
+      .labelToNode(LabelToNode.createUseLabelAsGiven())
+      .parse(jellyWriter)
     jellyWriter.finish()
 
   /** Convert Jelly text to Jelly binary.
diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
index 129b2c0..c6cf447 100644
--- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
+++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
@@ -78,6 +78,25 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers:
         content.containsAll(tripleModel.listStatements()) shouldBe true
       }
 
+      "preserve the original blank node IDs" in {
+        val inputString =
+          """_:b1 <http://a.com/p> _:b2 .
+            |_:b1 <http://a.com/p> _:b3 .
+            |""".stripMargin
+        val input = ByteArrayInputStream(inputString.getBytes)
+        RdfToJelly.setStdIn(input)
+        val (out, err) = RdfToJelly.runTestCommand(
+          List("rdf", "to-jelly", "--in-format", RdfFormat.NQuads.cliOptions.head),
+        )
+        val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes)
+        val content = translateJellyBack(newIn)
+        content.size() should be(2)
+        val statements = content.listStatements().asScala.toSeq
+        statements.flatMap(s => Seq(s.getSubject, s.getObject)).toSet
+          .map(_.asResource().getId.toString)
+          .toSet should be(Set("b1", "b2", "b3"))
+      }
+
       "input stream to output stream, generalized RDF (N-Triples)" in {
         val inputStream = new FileInputStream(getClass.getResource("/generalized.nt").getPath)
         RdfToJelly.setStdIn(inputStream)