diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala index cb67aff..0eaa138 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala @@ -10,12 +10,13 @@ import eu.neverblink.jelly.convert.jena.JenaConverterFactory import eu.neverblink.jelly.convert.jena.riot.{JellyFormatVariant, JellyLanguage, JellyStreamWriter} import eu.neverblink.jelly.core.{JellyOptions, RdfProtoDeserializationError} import eu.neverblink.jelly.core.proto.google.v1 as google -import eu.neverblink.jelly.core.proto.v1.{LogicalStreamType, PhysicalStreamType, RdfStreamOptions} +import eu.neverblink.jelly.core.proto.v1.* +import eu.neverblink.jelly.core.utils.IoUtils import org.apache.jena.riot.lang.LabelToNode import org.apache.jena.riot.system.StreamRDFWriter import org.apache.jena.riot.{Lang, RDFParser, RIOT} -import java.io.{BufferedReader, InputStream, InputStreamReader, OutputStream} +import java.io.{BufferedReader, FileInputStream, InputStream, InputStreamReader, OutputStream} import scala.util.Using object RdfToJellyPrint extends RdfCommandPrintUtil[RdfFormat.Readable]: @@ -45,6 +46,10 @@ case class RdfToJellyOptions( @ExtraName("in-format") inputFormat: Option[String] = None, @Recurse jellySerializationOptions: RdfJellySerializationOptions = RdfJellySerializationOptions(), + @HelpMessage( + "Jelly file to copy serialization options from. Options can be overridden with command line --opt.* options. Default: (unset)", + ) + optionsFrom: Option[String] = None, @HelpMessage( "Target number of rows per frame – the writer may slightly exceed that. Default: 256", ) @@ -72,8 +77,20 @@ object RdfToJelly extends RdfSerDesCommand[RdfToJellyOptions, RdfFormat.Readable val defaultAction: (InputStream, OutputStream) => Unit = langToJelly(RdfFormat.NQuads.jenaLang, _, _) + private def loadOptionsFromFile(filename: String): RdfStreamOptions = + val inputStream = new FileInputStream(filename) + val response = IoUtils.autodetectDelimiting(inputStream) + val frame = + if response.isDelimited then Using(response.newInput())(RdfStreamFrame.parseDelimitedFrom) + else Using(response.newInput())(RdfStreamFrame.parseFrom) + + frame.get.getRows.iterator().next().getOptions + override def doRun(options: RdfToJellyOptions, remainingArgs: RemainingArgs): Unit = // Infer before touching options + options.optionsFrom.map(loadOptionsFromFile).foreach( + options.jellySerializationOptions.setOptions, + ) options.jellySerializationOptions.inferGeneralized( options.inputFormat, remainingArgs.remaining.headOption, diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala index 6bf33d3..2884bb9 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala @@ -6,28 +6,37 @@ import eu.neverblink.jelly.core.proto.v1.{LogicalStreamType, PhysicalStreamType, import eu.neverblink.jelly.core.utils.LogicalStreamTypeUtils import eu.neverblink.jelly.core.JellyOptions +private val `default.opt.streamName`: String = "" +private val `default.opt.rdfStar`: Boolean = true +private val `default.opt.maxNameTableSize`: Int = JellyOptions.BIG_STRICT.getMaxNameTableSize +private val `default.opt.maxPrefixTableSize`: Int = JellyOptions.BIG_STRICT.getMaxPrefixTableSize +private val `default.opt.maxDatatypeTableSize`: Int = + JellyOptions.BIG_STRICT.getMaxDatatypeTableSize + /** Options for serializing in Jelly-RDF */ case class RdfJellySerializationOptions( @HelpMessage("Name of the output stream (in metadata). Default: (empty)") - `opt.streamName`: String = "", + `opt.streamName`: Option[String] = None, @HelpMessage( "Whether the stream may contain generalized triples, quads, or datasets. Default: (true for N-Triples/N-Quads and Jena binary formats, false otherwise)", ) `opt.generalizedStatements`: Option[Boolean] = None, - @HelpMessage("Whether the stream may contain RDF-star statements. Default: true") - `opt.rdfStar`: Boolean = true, @HelpMessage( - "Maximum size of the name lookup table. Default: " + JellyOptions.BIG_STRICT.getMaxNameTableSize, + "Whether the stream may contain RDF-star statements. Default: " + `default.opt.rdfStar`, + ) + `opt.rdfStar`: Option[Boolean] = None, + @HelpMessage( + "Maximum size of the name lookup table. Default: " + `default.opt.maxNameTableSize`, ) - `opt.maxNameTableSize`: Int = JellyOptions.BIG_STRICT.getMaxNameTableSize, + `opt.maxNameTableSize`: Option[Int] = None, @HelpMessage( - "Maximum size of the prefix lookup table. Default: " + JellyOptions.BIG_STRICT.getMaxPrefixTableSize, + "Maximum size of the prefix lookup table. Default: " + `default.opt.maxPrefixTableSize`, ) - `opt.maxPrefixTableSize`: Int = JellyOptions.BIG_STRICT.getMaxPrefixTableSize, + `opt.maxPrefixTableSize`: Option[Int] = None, @HelpMessage( - "Maximum size of the datatype lookup table. Default: " + JellyOptions.BIG_STRICT.getMaxDatatypeTableSize, + "Maximum size of the datatype lookup table. Default: " + `default.opt.maxDatatypeTableSize`, ) - `opt.maxDatatypeTableSize`: Int = JellyOptions.BIG_STRICT.getMaxDatatypeTableSize, + `opt.maxDatatypeTableSize`: Option[Int] = None, @HelpMessage( "Physical stream type. One of: TRIPLES, QUADS, GRAPHS. " + "Default: either TRIPLES or QUADS, depending on the input format.", @@ -41,8 +50,13 @@ case class RdfJellySerializationOptions( `opt.logicalType`: Option[String] = None, ): private object inferred: + var options: Option[RdfStreamOptions] = None var generalized: Boolean = false + def setOptions(rdfStreamOptions: RdfStreamOptions): Unit = inferred.options = Some( + rdfStreamOptions, + ) + def inferGeneralized(inputFormat: Option[String], filename: Option[String]): Unit = val explicitFormat = inputFormat.flatMap(RdfFormat.find) val implicitFormat = filename.flatMap(RdfFormat.inferFormat) @@ -52,7 +66,7 @@ case class RdfJellySerializationOptions( case _ => false } - lazy val asRdfStreamOptions: RdfStreamOptions = + private lazy val logicalType: Option[LogicalStreamType] = val logicalIri = `opt.logicalType` .map(_.trim).filter(_.nonEmpty) .map { @@ -72,7 +86,10 @@ case class RdfJellySerializationOptions( `opt.logicalType`.get, Some("Logical type must be either a full RDF-STaX IRI or a name like `FLAT_QUADS`"), ) - val physicalType = `opt.physicalType`.map(_.trim.toUpperCase) match + logicalType + + private lazy val physicalType: PhysicalStreamType = + `opt.physicalType`.map(_.trim.toUpperCase) match case Some("TRIPLES") => PhysicalStreamType.TRIPLES case Some("QUADS") => PhysicalStreamType.QUADS case Some("GRAPHS") => PhysicalStreamType.GRAPHS @@ -83,12 +100,41 @@ case class RdfJellySerializationOptions( Some("Physical type must be one of: TRIPLES, QUADS, GRAPHS"), ) case None => PhysicalStreamType.UNSPECIFIED + + private def makeStreamOptions(): RdfStreamOptions = RdfStreamOptions.newInstance() - .setStreamName(`opt.streamName`) + .setStreamName(`opt.streamName`.getOrElse(`default.opt.streamName`)) .setGeneralizedStatements(`opt.generalizedStatements`.getOrElse(inferred.generalized)) - .setRdfStar(`opt.rdfStar`) - .setMaxNameTableSize(`opt.maxNameTableSize`) - .setMaxPrefixTableSize(`opt.maxPrefixTableSize`) - .setMaxDatatypeTableSize(`opt.maxDatatypeTableSize`) + .setRdfStar(`opt.rdfStar`.getOrElse(`default.opt.rdfStar`)) + .setMaxNameTableSize(`opt.maxNameTableSize`.getOrElse(`default.opt.maxNameTableSize`)) + .setMaxPrefixTableSize(`opt.maxPrefixTableSize`.getOrElse(`default.opt.maxPrefixTableSize`)) + .setMaxDatatypeTableSize( + `opt.maxDatatypeTableSize`.getOrElse(`default.opt.maxDatatypeTableSize`), + ) .setPhysicalType(physicalType) .setLogicalType(logicalType.getOrElse(LogicalStreamType.UNSPECIFIED)) + + private lazy val optionsFromFileWithOverrides: Option[RdfStreamOptions] = + inferred.options.map(x => { + val cloned = x.clone() + if `opt.generalizedStatements`.isDefined then + cloned.setGeneralizedStatements(`opt.generalizedStatements`.get) + if `opt.streamName`.isDefined then // comment to stop scalafmt from making this a mess + cloned.setStreamName(`opt.streamName`.get) + if `opt.rdfStar`.isDefined then // comment to stop scalafmt from making this a mess + cloned.setRdfStar(`opt.rdfStar`.get) + if `opt.maxNameTableSize`.isDefined then + cloned.setMaxNameTableSize(`opt.maxNameTableSize`.get) + if `opt.maxPrefixTableSize`.isDefined then + cloned.setMaxPrefixTableSize(`opt.maxPrefixTableSize`.get) + if `opt.maxDatatypeTableSize`.isDefined then + cloned.setMaxDatatypeTableSize(`opt.maxDatatypeTableSize`.get) + if `opt.physicalType`.isDefined then // comment to stop scalafmt from making this a mess + cloned.setPhysicalType(physicalType) + if `opt.logicalType`.isDefined then + cloned.setLogicalType(logicalType.getOrElse(LogicalStreamType.UNSPECIFIED)) + cloned + }) + + lazy val asRdfStreamOptions: RdfStreamOptions = + optionsFromFileWithOverrides.getOrElse(makeStreamOptions()) diff --git a/src/test/resources/optionsNonDelimited.jelly b/src/test/resources/optionsNonDelimited.jelly new file mode 100644 index 0000000..9076949 --- /dev/null +++ b/src/test/resources/optionsNonDelimited.jelly @@ -0,0 +1,11 @@ + + + H P–X px + Rhttp://example.org/resource/ +Jr1 + Rhttp://example.org/property/ +J  announcedAt + Rhttp://example.org/location/ +Jl1 + +*J \ No newline at end of file diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala index e098417..b7ed437 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala @@ -454,6 +454,82 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: }, jenaLang = RDFLanguages.NTRIPLES, ) + + "loading options from another file" in withSpecificJellyFile( + optionsFile => + withFullJenaFile( + jenaFile => { + RdfToJelly.runTestCommand( + List( + "rdf", + "to-jelly", + "--options-from", + optionsFile, + jenaFile, + ), + ) + val frames = readJellyFile(new FileInputStream(optionsFile)) + val opts = frames.head.getRows.asScala.head.getOptions + val newFrames = readJellyFile(new ByteArrayInputStream(RdfToJelly.getOutBytes)) + val newOpts = newFrames.head.getRows.asScala.head.getOptions + opts should equal(newOpts) + + }, + jenaLang = RDFLanguages.NTRIPLES, + ), + fileName = "options.jelly", + ) + + "loading options from another and overriding" in withSpecificJellyFile( + optionsFile => + withFullJenaFile( + jenaFile => { + RdfToJelly.runTestCommand( + List( + "rdf", + "to-jelly", + "--options-from", + optionsFile, + jenaFile, + "--opt.rdf-star", + "false", + ), + ) + val frames = readJellyFile(new FileInputStream(optionsFile)) + val opts = frames.head.getRows.asScala.head.getOptions + val newFrames = readJellyFile(new ByteArrayInputStream(RdfToJelly.getOutBytes)) + val newOpts = newFrames.head.getRows.asScala.head.getOptions + opts shouldNot equal(newOpts) + opts.clone().setRdfStar(true) should equal(newOpts) + }, + jenaLang = RDFLanguages.NTRIPLES, + ), + fileName = "options.jelly", + ) + + "loading options from non-delimited file" in withSpecificJellyFile( + optionsFile => + withFullJenaFile( + jenaFile => { + RdfToJelly.runTestCommand( + List( + "rdf", + "to-jelly", + "--options-from", + optionsFile, + jenaFile, + ), + ) + val frame = Using(new FileInputStream(optionsFile))(RdfStreamFrame.parseFrom).get + val opts = frame.getRows.asScala.head.getOptions + val newFrames = readJellyFile(new ByteArrayInputStream(RdfToJelly.getOutBytes)) + val newOpts = newFrames.head.getRows.asScala.head.getOptions + opts should equal(newOpts) + }, + jenaLang = RDFLanguages.NTRIPLES, + ), + fileName = "optionsNonDelimited.jelly", + ) } "Turtle" in { val input = DataGenHelper.generateJenaInputStream(testCardinality, RDFLanguages.TURTLE)