Skip to content

Commit 1953875

Browse files
authored
Implement the rdf transcode command (#81)
* Implement rdf transcode command * review fixes
1 parent b9ac1e3 commit 1953875

File tree

9 files changed

+191
-6
lines changed

9 files changed

+191
-6
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ $ ./jelly-cli rdf from-jelly input.jelly --out-format=ttl > output.ttl
4141
You can specify most well-known formats supported by Apache Jena, but also a custom Jelly-Text format.
4242
Jelly-Text is a human-readable translation of Jelly binary. It's not meant for machine consumption. It is useful for debugging and inspecting Jelly files.
4343

44+
### Transcode Jelly files
45+
46+
The `rdf transcode` command turns one or more input Jelly streams into a single output stream. It's extremely fast, using a dedicated transcoding algorithm. However, each numerical option of the output stream (for example, `--opt.max-name-table-size`) must be greater than or equal to the corresponding option of the input stream(s); otherwise the command fails with a transcoding error.
47+
48+
```shell
49+
$ ./jelly-cli rdf transcode input.jelly > output.jelly
50+
```
51+
4452
### Inspect Jelly files
4553

4654
To inspect a Jelly file and get basic information describing its contents, such as stream options or number of triples in the file, run
@@ -74,6 +82,7 @@ Use the `--help` option to learn more about all the available settings:
7482
```shell
7583
$ ./jelly-cli rdf to-jelly --help
7684
$ ./jelly-cli rdf from-jelly --help
85+
$ ./jelly-cli rdf transcode --help
7786
$ ./jelly-cli rdf inspect --help
7887
$ ./jelly-cli rdf validate --help
7988
```

build.sbt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ resolvers +=
66
"Sonatype OSS Snapshots" at "https://s01.oss.sonatype.org/content/repositories/snapshots"
77

88
lazy val jenaV = "5.3.0"
9-
lazy val jellyV = "2.10.1"
9+
lazy val jellyV = "2.10.2"
1010

1111
addCommandAlias("fixAll", "scalafixAll; scalafmtAll")
1212

src/main/scala/eu/neverblink/jelly/cli/App.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ object App extends CommandsEntryPoint:
2222
Version,
2323
RdfFromJelly,
2424
RdfToJelly,
25+
RdfTranscode,
2526
RdfInspect,
2627
RdfValidate,
2728
)

src/main/scala/eu/neverblink/jelly/cli/Exceptions.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ case class ExitException(
3636
cause: Option[Throwable] = None,
3737
) extends CriticalException(
3838
s"Exiting with code $code." + cause.map(e => s" Cause: ${e.getMessage}").getOrElse(""),
39-
)
39+
):
40+
// Exposes the optional `cause` through the standard Throwable API. When no cause was supplied,
// the exception returns itself.
// NOTE(review): java.lang.Throwable#getCause is documented to return null when there is no
// cause; returning `this` makes the cause chain self-referential — confirm this is intended.
override def getCause: Throwable = cause.getOrElse(this)
4041

4142
/** Exception that carries only a message, which is forwarded to [[Exception]]. `ExitException`
  * extends this class.
  */
class CriticalException(message: String) extends Exception(message)

src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ case class RdfFromJellyOptions(
3838
@ExtraName("out-format") outputFormat: Option[String] = None,
3939
) extends HasJellyCommandOptions
4040

41-
object RdfFromJelly extends RdfTranscodeCommand[RdfFromJellyOptions, RdfFormat.Writeable]:
41+
object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writeable]:
4242

4343
override def names: List[List[String]] = List(
4444
List("rdf", "from-jelly"),

src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala renamed to src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfSerDesCommand.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import java.io.{InputStream, OutputStream}
1313

1414
/** This abstract class holds the logic shared by the RDF serialization and deserialization commands (`rdf to-jelly` and `rdf from-jelly`)
1515
*/
16-
abstract class RdfTranscodeCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat](
17-
using tt: TypeTest[RdfFormat, F],
16+
abstract class RdfSerDesCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat](using
17+
tt: TypeTest[RdfFormat, F],
1818
) extends JellyCommand[T]:
1919

2020
override final def group = "rdf"

src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ case class RdfToJellyOptions(
5555
delimited: Boolean = true,
5656
) extends HasJellyCommandOptions
5757

58-
object RdfToJelly extends RdfTranscodeCommand[RdfToJellyOptions, RdfFormat.Readable]:
58+
object RdfToJelly extends RdfSerDesCommand[RdfToJellyOptions, RdfFormat.Readable]:
5959

6060
override def names: List[List[String]] = List(
6161
List("rdf", "to-jelly"),
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package eu.neverblink.jelly.cli.command.rdf
2+
3+
import caseapp.*
4+
import eu.neverblink.jelly.cli.*
5+
import eu.neverblink.jelly.cli.command.rdf.util.*
6+
import eu.ostrzyciel.jelly.core.RdfProtoError
7+
import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions
8+
import eu.ostrzyciel.jelly.core.{JellyOptions, ProtoTranscoder}
9+
10+
import java.io.{InputStream, OutputStream}
11+
12+
/** Command-line options accepted by the `rdf transcode` command.
  *
  * @param common
  *   options of type `JellyCommandOptions`, pulled in with `@Recurse` (presumably shared by all
  *   commands — confirm against `JellyCommandOptions`)
  * @param outputFile
  *   path of the file to write the Jelly output to (also available as `--to`); when absent the
  *   output goes to stdout
  * @param jellySerializationOptions
  *   user-supplied stream options for the output; converted with `asRdfStreamOptions` before
  *   being handed to the transcoder
  */
@HelpMessage(
13+
"Quickly transcodes the input Jelly file into another Jelly file.\n" +
14+
"If no input file is specified, the input is read from stdin.\n" +
15+
"The input may be a concatenation of multiple Jelly streams.\n" +
16+
"Currently only frame-by-frame transcoding is supported.\n" +
17+
"The output's options must be greater than or equal to the input options.\n" +
18+
"Note: this command works in a streaming manner and scales well to large files.",
19+
)
20+
@ArgsName("<file-to-transcode>")
21+
case class RdfTranscodeOptions(
22+
@Recurse
23+
common: JellyCommandOptions = JellyCommandOptions(),
24+
@HelpMessage(
25+
"Output file to write the Jelly data to. If not specified, the output is written to stdout.",
26+
)
27+
@ExtraName("to") outputFile: Option[String] = None,
28+
@Recurse
29+
jellySerializationOptions: RdfJellySerializationOptions = RdfJellySerializationOptions(),
30+
// TODO: supported input options
31+
// TODO: make it possible to not only frame-by-frame transcode, but also regroup the rows
32+
// TODO: make it possible to do full transcoding (with Jena parsing)
33+
) extends HasJellyCommandOptions
34+
35+
/** The `rdf transcode` command: reads Jelly frames from a file or stdin, passes each frame through
  * a `ProtoTranscoder`, and writes the resulting frames (delimited) to a file or stdout.
  */
object RdfTranscode extends JellyCommand[RdfTranscodeOptions]:
  override def names: List[List[String]] = List(
    List("rdf", "transcode"),
  )

  override final def group = "rdf"

  /** Entry point of the command.
    *
    * The first remaining argument (if any) is used as the input file and `options.outputFile` as
    * the output file; both are resolved through `getIoStreamsFromOptions`. Any `RdfProtoError`
    * raised while transcoding is rethrown as a `JellyTranscodingError` with the same message.
    */
  override def doRun(options: RdfTranscodeOptions, remainingArgs: RemainingArgs): Unit =
    val outOpt = options.jellySerializationOptions.asRdfStreamOptions
    val (inputStream, outputStream) =
      getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile)
    try jellyToJelly(inputStream, outputStream, outOpt)
    catch case e: RdfProtoError => throw JellyTranscodingError(e.getMessage)

  /** Transcodes the input Jelly stream into another Jelly stream.
    *
    * The first row of the first input frame must be an options row: its physical type is always
    * copied to the output, and its logical type is used when the user did not specify one.
    *
    * @param inputStream
    *   input
    * @param outputStream
    *   output
    * @param outOpt
    *   user-defined options for the output
    * @throws CriticalException
    *   if the input contains no frames, the first frame has no rows, or the first row is not an
    *   options row
    */
  private def jellyToJelly(
      inputStream: InputStream,
      outputStream: OutputStream,
      outOpt: RdfStreamOptions,
  ): Unit =
    val in = JellyUtil.iterateRdfStream(inputStream).buffered
    // `head` on an iterator without elements throws NoSuchElementException, so peek with
    // `headOption` to report an input without any frame as an empty stream as well.
    val firstRow = in.headOption
      .flatMap(_.rows.headOption)
      .getOrElse(throw CriticalException("Empty input stream"))
    if !firstRow.row.isOptions then
      throw CriticalException("First input row is not an options row")
    val inOpt = firstRow.row.options

    val transcoder = ProtoTranscoder.fastMergingTranscoder(
      supportedInputOptions = JellyOptions.defaultSupportedOptions,
      outputOptions = outOpt.copy(
        // There is no way to specify the physical type with options currently.
        // Just use the one from the input.
        physicalType = inOpt.physicalType,
        logicalType =
          if outOpt.logicalType.isUnspecified then inOpt.logicalType else outOpt.logicalType,
      ),
    )

    // Peeking above did not consume the first frame (the iterator is buffered), so every input
    // frame, including the first one, is transcoded and written here.
    in.map(transcoder.ingestFrame)
      .foreach(_.writeDelimitedTo(outputStream))
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package eu.neverblink.jelly.cli.command.rdf
2+
3+
import eu.neverblink.jelly.cli.{ExitException, JellyTranscodingError}
4+
import eu.neverblink.jelly.cli.command.helpers.TestFixtureHelper
5+
import eu.neverblink.jelly.cli.command.rdf.util.{JellyUtil, RdfJellySerializationOptions}
6+
import eu.ostrzyciel.jelly.core.proto.v1.*
7+
import org.scalatest.matchers.should.Matchers
8+
import org.scalatest.wordspec.AnyWordSpec
9+
10+
import java.io.{ByteArrayInputStream, FileInputStream}
11+
12+
/** Tests for the `rdf transcode` command: transcoding from a file and from stdin, writing to an
  * output file, merging concatenated streams, overriding output options, and rejecting an output
  * name table that is smaller than the input's.
  */
class RdfTranscodeSpec extends AnyWordSpec, Matchers, TestFixtureHelper:
  protected val testCardinality: Int = 36

  private val defaultOpt = RdfJellySerializationOptions().asRdfStreamOptions

  /** Reads the whole stream into memory and closes it afterwards, so that the tests do not leak
    * file handles.
    */
  private def readAndClose(in: FileInputStream): Array[Byte] =
    scala.util.Using.resource(in)(_.readAllBytes())

  /** Asserts that the first frame in `b` has more than `testCardinality` rows and starts with an
    * options row that matches the default serialization options for a flat triple stream.
    */
  private def checkOutputWithDefaultOptions(b: Array[Byte]): Unit =
    val outF = RdfStreamFrame.parseDelimitedFrom(ByteArrayInputStream(b))
    outF.get.rows.size should be > testCardinality
    val opt = outF.get.rows.head.row.options
    opt.physicalType should be(PhysicalStreamType.TRIPLES)
    opt.logicalType should be(LogicalStreamType.FLAT_TRIPLES)
    opt.maxNameTableSize should be(defaultOpt.maxNameTableSize)
    opt.maxPrefixTableSize should be(defaultOpt.maxPrefixTableSize)
    opt.maxDatatypeTableSize should be(defaultOpt.maxDatatypeTableSize)
    opt.rdfStar should be(defaultOpt.rdfStar)
    opt.generalizedStatements should be(defaultOpt.generalizedStatements)

  "rdf transcode command" should {
    "transcode input file with no additional options" in withFullJellyFile { j =>
      RdfTranscode.runTestCommand(List("rdf", "transcode", j))
      val outB = RdfTranscode.getOutBytes
      checkOutputWithDefaultOptions(outB)
    }

    "transcode stdin with no additional options" in withFullJellyFile { j =>
      val inBytes = readAndClose(FileInputStream(j))
      RdfTranscode.setStdIn(ByteArrayInputStream(inBytes))
      RdfTranscode.runTestCommand(List("rdf", "transcode"))
      val outB = RdfTranscode.getOutBytes
      checkOutputWithDefaultOptions(outB)
    }

    "transcode input file to output file with no additional options" in withEmptyJellyFile { jOut =>
      withFullJellyFile { jIn =>
        RdfTranscode.runTestCommand(List("rdf", "transcode", "--to", jOut, jIn))
        val outB = readAndClose(FileInputStream(jOut))
        checkOutputWithDefaultOptions(outB)
      }
    }

    "merge 100 input streams" in withFullJellyFile { j =>
      val inBytes1 = readAndClose(FileInputStream(j))
      // Concatenate the same stream 100 times to simulate 100 input streams on stdin.
      val inBytes = (0 until 100).map(_ => inBytes1).reduce(_ ++ _)
      RdfTranscode.setStdIn(ByteArrayInputStream(inBytes))
      RdfTranscode.runTestCommand(List("rdf", "transcode"))
      val outB = RdfTranscode.getOutBytes
      checkOutputWithDefaultOptions(outB)
      val outFrames = JellyUtil.iterateRdfStream(ByteArrayInputStream(outB)).toSeq
      outFrames.size should be(100)
      outFrames.foreach { f =>
        f.rows.size should be >= testCardinality
      }
    }

    "transcode input file with changed output options" in withFullJellyFile { j =>
      RdfTranscode.runTestCommand(
        List(
          "rdf",
          "transcode",
          "--opt.max-prefix-table-size=600",
          "--opt.logical-type=GRAPHS",
          j,
        ),
      )
      val outB = RdfTranscode.getOutBytes
      val f = RdfStreamFrame.parseDelimitedFrom(ByteArrayInputStream(outB)).get
      f.rows.size should be > testCardinality
      val opt = f.rows.head.row.options
      opt.maxPrefixTableSize should be(600)
      // The physical type cannot be overridden from the command line; it comes from the input.
      opt.physicalType should be(PhysicalStreamType.TRIPLES)
      opt.logicalType should be(LogicalStreamType.GRAPHS)
    }

    "not allow for output name table size to smaller than the input" in withFullJellyFile { j =>
      val e = intercept[ExitException] {
        RdfTranscode.runTestCommand(List("rdf", "transcode", "--opt.max-name-table-size=60", j))
      }
      val cause = e.getCause
      cause shouldBe a[JellyTranscodingError]
      cause.getMessage should include("Input lookup size cannot be greater")
    }
  }

0 commit comments

Comments
 (0)