Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ $ ./jelly-cli rdf from-jelly input.jelly --out-format=ttl > output.ttl
You can specify most well-known formats supported by Apache Jena, but also a custom Jelly-Text format.
Jelly-Text is a human-readable translation of Jelly binary. It's not meant for machine consumption. It is useful for debugging and inspecting Jelly files.

### Transcode Jelly files

The `rdf transcode` command turns one or more input Jelly streams into a single output stream. It's extremely fast, using a dedicated transcoding algorithm, but the output stream's options must be the same as or greater than those of the inputs.

```shell
$ ./jelly-cli rdf transcode input.jelly > output.jelly
```

### Inspect Jelly files

To inspect a Jelly file and get basic information describing its contents, such as stream options or number of triples in the file, run
Expand Down Expand Up @@ -74,6 +82,7 @@ Use the `--help` option to learn more about all the available settings:
```shell
$ ./jelly-cli rdf to-jelly --help
$ ./jelly-cli rdf from-jelly --help
$ ./jelly-cli rdf transcode --help
$ ./jelly-cli rdf inspect --help
$ ./jelly-cli rdf validate --help
```
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/eu/neverblink/jelly/cli/App.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ object App extends CommandsEntryPoint:
Version,
RdfFromJelly,
RdfToJelly,
RdfTranscode,
RdfInspect,
RdfValidate,
)
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ case class RdfFromJellyOptions(
@ExtraName("out-format") outputFormat: Option[String] = None,
) extends HasJellyCommandOptions

object RdfFromJelly extends RdfTranscodeCommand[RdfFromJellyOptions, RdfFormat.Writeable]:
object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writeable]:

override def names: List[List[String]] = List(
List("rdf", "from-jelly"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ import java.io.{InputStream, OutputStream}

/** This abstract class is responsible for the common logic in both RDF parsing commands
*/
abstract class RdfTranscodeCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat](
using tt: TypeTest[RdfFormat, F],
abstract class RdfSerDesCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat](using
tt: TypeTest[RdfFormat, F],
) extends JellyCommand[T]:

override final def group = "rdf"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ case class RdfToJellyOptions(
delimited: Boolean = true,
) extends HasJellyCommandOptions

object RdfToJelly extends RdfTranscodeCommand[RdfToJellyOptions, RdfFormat.Readable]:
object RdfToJelly extends RdfSerDesCommand[RdfToJellyOptions, RdfFormat.Readable]:

override def names: List[List[String]] = List(
List("rdf", "to-jelly"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package eu.neverblink.jelly.cli.command.rdf

import caseapp.*
import eu.neverblink.jelly.cli.*
import eu.neverblink.jelly.cli.command.rdf.util.*
import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions
import eu.ostrzyciel.jelly.core.{JellyOptions, ProtoTranscoder}

import java.io.{InputStream, OutputStream}

/** Command-line options of the `rdf transcode` command.
 *
 * The single positional argument (shown as `<file-to-transcode>` in the help text) is not a
 * field of this class; it is read from the remaining arguments by the command itself.
 *
 * @param common
 *   nested [[JellyCommandOptions]], pulled into this command's flags via `@Recurse`
 * @param outputFile
 *   path of the file to write the Jelly output to (`--to`); stdout is used when it is `None`
 * @param jellySerializationOptions
 *   nested Jelly stream options, pulled in via `@Recurse`; the command converts them with
 *   `asRdfStreamOptions` and uses the result as the requested output options
 */
@HelpMessage(
"Quickly transcodes the input Jelly file into another Jelly file.\n" +
"If no input file is specified, the input is read from stdin.\n" +
"The input may be a concatenation of multiple Jelly streams.\n" +
"Currently only frame-by-frame transcoding is supported.\n" +
"The output's options must be greater than or equal to the input options.\n" +
"Note: this command works in a streaming manner and scales well to large files.",
)
@ArgsName("<file-to-transcode>")
case class RdfTranscodeOptions(
@Recurse
common: JellyCommandOptions = JellyCommandOptions(),
@HelpMessage(
"Output file to write the Jelly data to. If not specified, the output is written to stdout.",
)
@ExtraName("to") outputFile: Option[String] = None,
@Recurse
jellySerializationOptions: RdfJellySerializationOptions = RdfJellySerializationOptions(),
// TODO: supported input options
// TODO: make it possible to not only frame-by-frame transcode, but also regroup the rows
// TODO: make it possible to do full transcoding (with Jena parsing)
) extends HasJellyCommandOptions

/** The `rdf transcode` command: re-encodes Jelly input as a single Jelly output stream,
 * one frame at a time, without going through a full RDF parser.
 */
object RdfTranscode extends JellyCommand[RdfTranscodeOptions]:
  override def names: List[List[String]] = List(
    List("rdf", "transcode"),
  )

  override final def group = "rdf"

  /** Resolves the I/O streams from the first positional argument and the `--to` option, then
   * transcodes the input into the output using the user-supplied stream options.
   */
  override def doRun(options: RdfTranscodeOptions, remainingArgs: RemainingArgs): Unit =
    val outOpt = options.jellySerializationOptions.asRdfStreamOptions
    val (inputStream, outputStream) =
      getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile)
    jellyToJelly(inputStream, outputStream, outOpt)

  /** Transcodes the input Jelly stream into another Jelly stream.
   *
   * The first row of the first frame must be an options row; its physical type (and its logical
   * type, when the user did not specify one) is carried over to the output options.
   *
   * @param inputStream
   *   input
   * @param outputStream
   *   output; every transcoded frame is written to it in delimited form
   * @param outOpt
   *   user-defined options for the output
   * @throws CriticalException
   *   if the input contains no frames, the first frame has no rows, or the first row is not an
   *   options row
   */
  private def jellyToJelly(
      inputStream: InputStream,
      outputStream: OutputStream,
      outOpt: RdfStreamOptions,
  ): Unit =
    val in = JellyUtil.iterateRdfStream(inputStream).buffered
    // `head` on an exhausted iterator throws NoSuchElementException, so peek with `headOption`
    // and report an input without any frames as a regular CLI error instead.
    val head = in.headOption.getOrElse(throw CriticalException("Empty input stream"))
    if head.rows.isEmpty then throw CriticalException("Empty input stream")
    if !head.rows.head.row.isOptions then
      throw CriticalException("First input row is not an options row")
    val inOpt = head.rows.head.row.options

    val transcoder = ProtoTranscoder.fastMergingTranscoder(
      supportedInputOptions = JellyOptions.defaultSupportedOptions,
      outputOptions = outOpt.copy(
        // There is no way to specify the physical type with options currently.
        // Just use the one from the input.
        physicalType = inOpt.physicalType,
        logicalType =
          if outOpt.logicalType.isUnspecified then inOpt.logicalType else outOpt.logicalType,
      ),
    )

    // The buffered iterator only peeked at the first frame, so it is still transcoded here.
    in.map(transcoder.ingestFrame)
      .foreach(_.writeDelimitedTo(outputStream))
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package eu.neverblink.jelly.cli.command.rdf

import eu.neverblink.jelly.cli.command.helpers.TestFixtureHelper
import eu.neverblink.jelly.cli.command.rdf.util.{JellyUtil, RdfJellySerializationOptions}
import eu.ostrzyciel.jelly.core.proto.v1.*
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

import java.io.{ByteArrayInputStream, FileInputStream}

/** Tests of the `rdf transcode` command: file and stdin input, file output, merging of
 * concatenated streams, and overriding the output stream options.
 */
class RdfTranscodeSpec extends AnyWordSpec, Matchers, TestFixtureHelper:
  protected val testCardinality: Int = 36

  private val defaultOpt = RdfJellySerializationOptions().asRdfStreamOptions

  /** Reads the whole file and closes the underlying stream afterwards. */
  private def readFileBytes(path: String): Array[Byte] =
    scala.util.Using.resource(FileInputStream(path))(_.readAllBytes())

  /** Checks that the first frame in `b` has more rows than `testCardinality` and that its
   * leading options row carries the TRIPLES / FLAT_TRIPLES types together with the default
   * table sizes and feature flags.
   */
  private def checkOutputWithDefaultOptions(b: Array[Byte]): Unit =
    val outF = RdfStreamFrame.parseDelimitedFrom(ByteArrayInputStream(b))
    // Use the shared constant instead of a duplicated literal, as the other tests do.
    outF.get.rows.size should be > testCardinality
    val opt = outF.get.rows.head.row.options
    opt.physicalType should be(PhysicalStreamType.TRIPLES)
    opt.logicalType should be(LogicalStreamType.FLAT_TRIPLES)
    opt.maxNameTableSize should be(defaultOpt.maxNameTableSize)
    opt.maxPrefixTableSize should be(defaultOpt.maxPrefixTableSize)
    opt.maxDatatypeTableSize should be(defaultOpt.maxDatatypeTableSize)
    opt.rdfStar should be(defaultOpt.rdfStar)
    opt.generalizedStatements should be(defaultOpt.generalizedStatements)

  "transcode input file with no additional options" in withFullJellyFile { j =>
    RdfTranscode.runTestCommand(List("rdf", "transcode", j))
    val outB = RdfTranscode.getOutBytes
    checkOutputWithDefaultOptions(outB)
  }

  "transcode stdin with no additional options" in withFullJellyFile { j =>
    val inBytes = readFileBytes(j)
    RdfTranscode.setStdIn(ByteArrayInputStream(inBytes))
    RdfTranscode.runTestCommand(List("rdf", "transcode"))
    val outB = RdfTranscode.getOutBytes
    checkOutputWithDefaultOptions(outB)
  }

  "transcode input file to output file with no additional options" in withEmptyJellyFile { jOut =>
    withFullJellyFile { jIn =>
      RdfTranscode.runTestCommand(List("rdf", "transcode", "--to", jOut, jIn))
      val outB = readFileBytes(jOut)
      checkOutputWithDefaultOptions(outB)
    }
  }

  "merge 100 input streams" in withFullJellyFile { j =>
    val inBytes1 = readFileBytes(j)
    // Feed the same stream 100 times, back to back, through stdin.
    val inBytes = (0 until 100).map(_ => inBytes1).reduce(_ ++ _)
    RdfTranscode.setStdIn(ByteArrayInputStream(inBytes))
    RdfTranscode.runTestCommand(List("rdf", "transcode"))
    val outB = RdfTranscode.getOutBytes
    checkOutputWithDefaultOptions(outB)
    val outFrames = JellyUtil.iterateRdfStream(ByteArrayInputStream(outB)).toSeq
    outFrames.size should be(100)
    outFrames.foreach { f =>
      f.rows.size should be >= testCardinality
    }
  }

  "transcode input file with changed output options" in withFullJellyFile { j =>
    RdfTranscode.runTestCommand(
      List(
        "rdf",
        "transcode",
        "--opt.max-prefix-table-size=600",
        "--opt.logical-type=GRAPHS",
        j,
      ),
    )
    val outB = RdfTranscode.getOutBytes
    val f = RdfStreamFrame.parseDelimitedFrom(ByteArrayInputStream(outB)).get
    f.rows.size should be > testCardinality
    val opt = f.rows.head.row.options
    opt.maxPrefixTableSize should be(600)
    opt.physicalType should be(PhysicalStreamType.TRIPLES)
    opt.logicalType should be(LogicalStreamType.GRAPHS)
  }