diff --git a/build.sbt b/build.sbt index 37fbd1c..bab41a3 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ resolvers += "Sonatype OSS Snapshots" at "https://s01.oss.sonatype.org/content/repositories/snapshots" lazy val jenaV = "5.3.0" -lazy val jellyV = "2.9.1+8-58db074b-SNAPSHOT" +lazy val jellyV = "2.9.1+10-e92cafe2-SNAPSHOT" addCommandAlias("fixAll", "scalafixAll; scalafmtAll") diff --git a/src/main/scala/eu/neverblink/jelly/cli/App.scala b/src/main/scala/eu/neverblink/jelly/cli/App.scala index 1585481..c7f4651 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/App.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/App.scala @@ -23,4 +23,5 @@ object App extends CommandsEntryPoint: RdfFromJelly, RdfToJelly, RdfInspect, + RdfValidate, ) diff --git a/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala b/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala index 63a6c2c..b2b02e8 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala @@ -1,7 +1,7 @@ package eu.neverblink.jelly.cli import caseapp.* -import eu.neverblink.jelly.cli.util.IoUtil +import eu.neverblink.jelly.cli.util.io.IoUtil import java.io.* import scala.compiletime.uninitialized @@ -120,8 +120,6 @@ abstract class JellyCommand[T <: HasJellyCommandOptions: {Parser, Help}] extends else System.in final def setStdIn(data: ByteArrayInputStream): Unit = - validateTestMode() - in.reset() in = data final def getOutStream: OutputStream = diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala index aa6db16..7daaa96 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala @@ -1,9 +1,10 @@ package eu.neverblink.jelly.cli.command.rdf + import caseapp.* import eu.neverblink.jelly.cli.* -import 
eu.neverblink.jelly.cli.command.rdf.RdfFormat.* -import eu.neverblink.jelly.cli.command.rdf.RdfFormat.Jena.* -import eu.neverblink.jelly.cli.util.JellyUtil +import eu.neverblink.jelly.cli.command.rdf.util.* +import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat.* +import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat.Jena.* import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame import org.apache.jena.riot.system.StreamRDFWriter @@ -42,9 +43,9 @@ object RdfFromJelly extends RdfTranscodeCommand[RdfFromJellyOptions, RdfFormat.W parseFormatArgs(inputStream, outputStream, options.outputFormat, options.outputFile) override def matchFormatToAction( - option: RdfFormat.Writeable, + format: RdfFormat.Writeable, ): Option[(InputStream, OutputStream) => Unit] = - option match + format match case j: RdfFormat.Jena.Writeable => Some(jellyToLang(j.jenaLang, _, _)) case RdfFormat.JellyText => Some(jellyBinaryToText) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala index 8065104..76a3ddb 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -2,8 +2,8 @@ package eu.neverblink.jelly.cli.command.rdf import caseapp.{ExtraName, Recurse} import caseapp.core.RemainingArgs -import eu.neverblink.jelly.cli.util.{FrameInfo, JellyUtil, MetricsPrinter} import eu.neverblink.jelly.cli.* +import eu.neverblink.jelly.cli.command.rdf.util.{FrameInfo, JellyUtil, MetricsPrinter} import eu.ostrzyciel.jelly.core.proto.v1.* import java.io.InputStream diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala index 276c13b..34c404d 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala +++ 
b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala @@ -1,7 +1,9 @@ package eu.neverblink.jelly.cli.command.rdf + import caseapp.* import eu.neverblink.jelly.cli.* -import eu.neverblink.jelly.cli.command.rdf.RdfFormat.* +import eu.neverblink.jelly.cli.command.rdf.util.* +import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat.* import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame import org.apache.jena.riot.system.StreamRDFWriter diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala index 71a3c42..5581582 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala @@ -4,6 +4,7 @@ import com.google.protobuf.InvalidProtocolBufferException import org.apache.jena.riot.RiotException import eu.neverblink.jelly.cli.* import caseapp.* +import eu.neverblink.jelly.cli.command.rdf.util.{RdfCommandPrintUtil, RdfFormat} import scala.reflect.TypeTest import eu.ostrzyciel.jelly.core.{RdfProtoDeserializationError, RdfProtoSerializationError} @@ -25,7 +26,7 @@ abstract class RdfTranscodeCommand[T <: HasJellyCommandOptions: {Parser, Help}, lazy val printUtil: RdfCommandPrintUtil[F] /** The method responsible for matching the format to a given action */ - def matchFormatToAction(option: F): Option[(InputStream, OutputStream) => Unit] + def matchFormatToAction(format: F): Option[(InputStream, OutputStream) => Unit] /** This method takes care of proper error handling and takes care of the parameter priorities in * matching the input to a given format conversion @@ -42,7 +43,7 @@ abstract class RdfTranscodeCommand[T <: HasJellyCommandOptions: {Parser, Help}, * @throws JenaRiotException * @throws InvalidJellyFile */ - def parseFormatArgs( + final def parseFormatArgs( inputStream: 
InputStream,
       outputStream: OutputStream,
       format: Option[String],
diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala
new file mode 100644
index 0000000..735d819
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala
@@ -0,0 +1,245 @@
+package eu.neverblink.jelly.cli.command.rdf
+
+import caseapp.*
+import eu.neverblink.jelly.cli.*
+import eu.neverblink.jelly.cli.command.rdf.util.*
+import eu.neverblink.jelly.cli.util.args.IndexRange
+import eu.neverblink.jelly.cli.util.io.IoUtil
+import eu.neverblink.jelly.cli.util.jena.*
+import eu.ostrzyciel.jelly.convert.jena.JenaConverterFactory
+import eu.ostrzyciel.jelly.core.JellyOptions
+import eu.ostrzyciel.jelly.core.proto.v1.{RdfStreamFrame, RdfStreamOptions}
+import org.apache.jena.graph.Triple
+import org.apache.jena.riot.RDFParser
+import org.apache.jena.riot.system.{StreamRDF, StreamRDFLib}
+import org.apache.jena.sparql.core.Quad
+
+import scala.util.Using
+
+object RdfValidatePrint extends RdfCommandPrintUtil[RdfFormat.Jena]:
+  override val defaultFormat: RdfFormat = RdfFormat.NQuads
+
+@HelpMessage(
+  "Validates a Jelly-RDF stream.\nIf no additional options are specified, " +
+    "only basic validations are performed. You can also validate the stream against " +
+    "a reference RDF file, check the stream options, and its delimiting.\n" +
+    "If an error is detected, the program will exit with a non-zero code.\n" +
+    "Otherwise, the program will exit with code 0.\n" +
+    "Note: this command does not work in a streaming manner. If you try to validate a very large " +
+    "file, you may run out of memory.",
+)
+@ArgsName("")
+case class RdfValidateOptions(
+    @Recurse
+    common: JellyCommandOptions = JellyCommandOptions(),
+    @HelpMessage(
+      "RDF file to compare the input stream to. If not specified, no comparison is done.",
+    )
+    compareToRdfFile: Option[String] = None,
+    @HelpMessage(
+      "Format of the RDF file to compare the input stream to. If not specified, the format is " +
+        "inferred from the file name.",
+    )
+    compareToFormat: Option[String] = None,
+    @HelpMessage(
+      "Whether the comparison should be ordered (statements must come in a specific order) or " +
+        "unordered (RDF dataset isomorphism). Default: false (unordered)",
+    )
+    compareOrdered: Boolean = false,
+    @HelpMessage(
+      "Frame indices to compare. If not specified, all frames are compared. " +
+        "The indices are 0-based and can be specified as a Rust-style range: " +
+        "'..3', '3..', '1..5', '4..=6'",
+    )
+    compareFrameIndices: String = "",
+    @HelpMessage(
+      "File with the expected stream options. If not specified, the options are not checked.",
+    )
+    optionsFile: Option[String] = None,
+    @HelpMessage(
+      "Whether the input stream should be checked to be delimited or undelimited. " +
+        "Possible values: 'either', 'true', 'false'. Default: 'either'.",
+    )
+    delimited: String = "either",
+) extends HasJellyCommandOptions
+
+object RdfValidate extends JellyCommand[RdfValidateOptions]:
+  private enum Delimiting:
+    case Either, Delimited, Undelimited
+
+  override def names: List[List[String]] = List(List("rdf", "validate"))
+
+  override def group = "rdf"
+
+  override def doRun(options: RdfValidateOptions, remainingArgs: RemainingArgs): Unit =
+    // Parse input options
+    val frameIndices = IndexRange(options.compareFrameIndices, "--compare-frame-indices")
+    val delimiting = options.delimited match
+      case "" | "either" => Delimiting.Either
+      case "true" => Delimiting.Delimited
+      case "false" => Delimiting.Undelimited
+      case _ =>
+        throw InvalidArgument(
+          "--delimited",
+          options.delimited,
+          Some("Valid values: true, false, either"),
+        )
+    val rdfComparison =
+      options.compareToRdfFile.map(n => getRdfForComparison(n, options.compareToFormat))
+    val (inputStream, _) = getIoStreamsFromOptions(remainingArgs.remaining.headOption, None)
+    val (delimited, frameIterator) = JellyUtil.iterateRdfStreamWithDelimitingInfo(inputStream)
+
+    // Step 1: Validate delimiting
+    validateDelimiting(delimiting, delimited)
+    // Step 2: Validate basic stream structure & the stream options
+    val framesSeq = frameIterator.toSeq
+    validateOptions(framesSeq)
+    // Step 3: Validate the content
+    validateContent(framesSeq, frameIndices, rdfComparison)
+
+  /** Checks that the detected delimiting of the input matches what the user asked for. */
+  private def validateDelimiting(
+      expected: Delimiting,
+      delimited: Boolean,
+  ): Unit = expected match
+    case Delimiting.Either => ()
+    case Delimiting.Delimited =>
+      if !delimited then
+        throw CriticalException("Expected delimited input, but the file was not delimited")
+    case Delimiting.Undelimited =>
+      if delimited then
+        throw CriticalException("Expected undelimited input, but the file was delimited")
+
+  /** Checks that the stream starts with an options row and that the options are supported (or,
+    * if an options file was given, that they are equal to the expected ones).
+    */
+  private def validateOptions(frames: Seq[RdfStreamFrame]): Unit =
+    // Validate basic stream structure
+    if frames.isEmpty then throw CriticalException("Empty input stream")
+    if frames.head.rows.isEmpty then
+      throw CriticalException("First frame in the input stream is empty")
+    if !frames.head.rows.head.row.isOptions then
+      throw CriticalException("First row in the input stream does not contain stream options")
+    val streamOptions = frames.head.rows.head.row.options
+    // If we have expected options, we need to read and validate them
+    val expectedOptions = getOptions.optionsFile.map { optionsFileName =>
+      val o = readExpectedOptions(optionsFileName)
+      if streamOptions != o then
+        throw CriticalException(
+          s"Stream options do not match the expected options in $optionsFileName\n" +
+            s"Expected: $o\n" +
+            s"Actual: $streamOptions",
+        )
+      o
+    }
+    JellyOptions.checkCompatibility(
+      streamOptions,
+      expectedOptions.getOrElse(JellyOptions.defaultSupportedOptions),
+    )
+
+  /** Reads the expected stream options from the first row of the given Jelly file.
+    * @throws CriticalException
+    *   if the file has no frames, no rows, or its first row is not an options row
+    */
+  private def readExpectedOptions(optionsFileName: String): RdfStreamOptions =
+    Using.resource(IoUtil.inputStream(optionsFileName)) { is =>
+      val firstRow = JellyUtil.iterateRdfStream(is).nextOption().flatMap(_.rows.headOption)
+      firstRow match
+        case Some(r) if r.row.isOptions => r.row.options
+        case _ =>
+          throw CriticalException(
+            s"Options file $optionsFileName does not start with a stream options row",
+          )
+    }
+
+  /** Decodes the whole stream, checks it against the declared stream options and, if a reference
+    * was given, compares the statements of the selected frames with it.
+    */
+  private def validateContent(
+      frames: Seq[RdfStreamFrame],
+      frameIndices: IndexRange,
+      maybeRdfComparison: Option[StreamRdfCollector],
+  ): Unit =
+    // Statements are collected only if there is something to compare them with
+    val maybeCollector = maybeRdfComparison.map(_ => StreamRdfCollector())
+    val discard = StreamRDFLib.sinkNull()
+    val selected: StreamRDF = maybeCollector.getOrElse(discard)
+    // The decoder is stateful (options, lookup tables), so it must see every frame, also those
+    // outside of the requested range. Only the output of the selected frames is kept.
+    var consumer: StreamRDF = discard
+    val opt = frames.head.rows.head.row.options
+    val dec = JenaConverterFactory.anyStatementDecoder(
+      None,
+      (prefix, iri) => consumer.prefix(prefix, iri.getURI),
+    )
+    for (frame, frameIndex) <- frames.zipWithIndex do
+      val inRange =
+        frameIndices.start.forall(frameIndex >= _) && frameIndices.end.forall(frameIndex < _)
+      consumer = if inRange then selected else discard
+      for row <- frame.rows do
+        if row.row.isOptions && row.row.options != opt then
+          throw CriticalException(
+            s"Later occurrence of stream options in frame $frameIndex does not match the first",
+          )
+        // Push the stream frames through the decoder
+        // This will catch most of the errors
+        dec.ingestRowFlat(row) match
+          case null => ()
+          // Check if the stream really does not contain any RDF-star or generalized statements
+          // if it doesn't declare to use them. This is normally not checked by the decoder
+          // because it's too performance-costly.
+          case t: Triple =>
+            if !opt.generalizedStatements && StatementUtils.isGeneralized(t) then
+              throw CriticalException(s"Unexpected generalized triple in frame $frameIndex: $t")
+            if !opt.rdfStar && StatementUtils.isRdfStar(t) then
+              throw CriticalException(s"Unexpected RDF-star triple in frame $frameIndex: $t")
+            consumer.triple(t)
+          case q: Quad =>
+            if !opt.generalizedStatements && StatementUtils.isGeneralized(q) then
+              throw CriticalException(s"Unexpected generalized quad in frame $frameIndex: $q")
+            if !opt.rdfStar && StatementUtils.isRdfStar(q) then
+              throw CriticalException(s"Unexpected RDF-star quad in frame $frameIndex: $q")
+            consumer.quad(q)
+    // Compare the Jelly data with the reference RDF data, if specified
+    for
+      rdfComparison <- maybeRdfComparison
+      actual <- maybeCollector
+    do
+      val comparator =
+        if getOptions.compareOrdered then OrderedRdfCompare
+        else UnorderedRdfCompare
+      comparator.compare(rdfComparison, actual)
+
+  /** Reads the RDF file for comparison and returns a StreamRdfCollector
+    * @param fileName
+    *   filename to read
+    * @param formatName
+    *   optional format name; if it is missing or does not name a Jena format, the format is
+    *   inferred from the file name
+    * @return
+    *   collector holding everything parsed from the file
+    * @throws InvalidFormatSpecified
+    *   if no Jena format could be determined
+    */
+  private def getRdfForComparison(
+      fileName: String,
+      formatName: Option[String],
+  ): StreamRdfCollector =
+    val explicitFormat = formatName.flatMap(RdfFormat.find)
+    val implicitFormat = RdfFormat.inferFormat(fileName)
+    val format = (explicitFormat, implicitFormat) match {
+      case (Some(f: RdfFormat.Jena), _) => f
+      case (_, Some(f: RdfFormat.Jena)) => f
+      case (_, _) =>
+        throw InvalidFormatSpecified(
+          formatName.getOrElse(""),
+          RdfValidatePrint.validFormatsString,
+        )
+    }
+    val output = StreamRdfCollector()
+    Using.resource(IoUtil.inputStream(fileName)) { is =>
+      RDFParser.source(is)
+        .lang(format.jenaLang)
+        .parse(output)
+    }
+    output
diff --git
a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/JellyUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/JellyUtil.scala
new file mode 100644
index 0000000..ad1ad7c
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/JellyUtil.scala
@@ -0,0 +1,43 @@
+package eu.neverblink.jelly.cli.command.rdf.util
+
+import eu.ostrzyciel.jelly.core.IoUtils
+import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame
+
+import java.io.InputStream
+
+object JellyUtil:
+  /** Reads the Jelly file and returns an iterator of RdfStreamFrame
+    *
+    * @param inputStream
+    *   stream with the Jelly data, either delimited or non-delimited
+    * @return
+    *   iterator over the frames of the stream
+    */
+  def iterateRdfStream(
+      inputStream: InputStream,
+  ): Iterator[RdfStreamFrame] = iterateRdfStreamWithDelimitingInfo(inputStream)._2
+
+  /** Reads the Jelly file and returns an iterator of RdfStreamFrame and a boolean indicating if the
+    * file is delimited or not
+    *
+    * @param inputStream
+    *   stream with the Jelly data, either delimited or non-delimited
+    * @return
+    *   pair of: true if the stream is delimited, and the iterator over its frames
+    */
+  def iterateRdfStreamWithDelimitingInfo(
+      inputStream: InputStream,
+  ): (Boolean, Iterator[RdfStreamFrame]) =
+    IoUtils.autodetectDelimiting(inputStream) match
+      case (false, newIn) =>
+        // Non-delimited Jelly file
+        // In this case, we can only read one frame
+        (false, Iterator(RdfStreamFrame.parseFrom(newIn)))
+      case (true, newIn) =>
+        // Delimited Jelly file
+        // In this case, we can read multiple frames
+        (
+          true,
+          Iterator.continually(RdfStreamFrame.parseDelimitedFrom(newIn))
+            .takeWhile(_.isDefined).flatten,
+        )
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/MetricsPrinter.scala
similarity index 96%
rename from src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala
rename to src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/MetricsPrinter.scala
index 82f0d9d..60e2d94 100644
--- a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala
+++
b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/MetricsPrinter.scala @@ -1,6 +1,7 @@ -package eu.neverblink.jelly.cli.util +package eu.neverblink.jelly.cli.command.rdf.util -import eu.neverblink.jelly.cli.util.YamlDocBuilder.* +import eu.neverblink.jelly.cli.util.io.YamlDocBuilder.* +import eu.neverblink.jelly.cli.util.io.YamlDocBuilder import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions import java.io.OutputStream diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommandPrintUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfCommandPrintUtil.scala similarity index 91% rename from src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommandPrintUtil.scala rename to src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfCommandPrintUtil.scala index 9d0174c..f9f6b66 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommandPrintUtil.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfCommandPrintUtil.scala @@ -1,4 +1,4 @@ -package eu.neverblink.jelly.cli.command.rdf +package eu.neverblink.jelly.cli.command.rdf.util import scala.reflect.TypeTest diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFormat.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfFormat.scala similarity index 98% rename from src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFormat.scala rename to src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfFormat.scala index 3abddc0..3046a92 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFormat.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfFormat.scala @@ -1,4 +1,4 @@ -package eu.neverblink.jelly.cli.command.rdf +package eu.neverblink.jelly.cli.command.rdf.util import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage import org.apache.jena.riot.{Lang, RDFLanguages} diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfJellySerializationOptions.scala 
b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala similarity index 98% rename from src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfJellySerializationOptions.scala rename to src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala index 0160b54..68ca9df 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfJellySerializationOptions.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/util/RdfJellySerializationOptions.scala @@ -1,9 +1,9 @@ -package eu.neverblink.jelly.cli.command.rdf +package eu.neverblink.jelly.cli.command.rdf.util import caseapp.* import eu.neverblink.jelly.cli.InvalidArgument -import eu.ostrzyciel.jelly.core.{JellyOptions, LogicalStreamTypeFactory} import eu.ostrzyciel.jelly.core.proto.v1.{LogicalStreamType, RdfStreamOptions} +import eu.ostrzyciel.jelly.core.{JellyOptions, LogicalStreamTypeFactory} /** Options for serializing in Jelly-RDF */ case class RdfJellySerializationOptions( diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala deleted file mode 100644 index 5924b84..0000000 --- a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala +++ /dev/null @@ -1,27 +0,0 @@ -package eu.neverblink.jelly.cli.util - -import eu.ostrzyciel.jelly.core.IoUtils -import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame - -import java.io.InputStream - -object JellyUtil: - /** This method reads the Jelly file and returns an iterator of RdfStreamFrame - * - * @param inputStream - * @param outputStream - * @return - */ - def iterateRdfStream( - inputStream: InputStream, - ): Iterator[RdfStreamFrame] = - IoUtils.autodetectDelimiting(inputStream) match - case (false, newIn) => - // Non-delimited Jelly file - // In this case, we can only read one frame - Iterator(RdfStreamFrame.parseFrom(newIn)) - case (true, newIn) => - // Delimited Jelly file - // In this case, we can read 
multiple frames
-        Iterator.continually(RdfStreamFrame.parseDelimitedFrom(newIn))
-          .takeWhile(_.isDefined).map(_.get)
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala b/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala
new file mode 100644
index 0000000..a841ab2
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala
@@ -0,0 +1,72 @@
+package eu.neverblink.jelly.cli.util.args
+
+import eu.neverblink.jelly.cli.InvalidArgument
+
+import scala.collection.IterableOnceOps
+import scala.util.control.NonFatal
+
+/** Half-open range of indices. A missing bound leaves the range unbounded on that side.
+  * @param start
+  *   first index of the range (inclusive)
+  * @param end
+  *   index at which the range ends (exclusive)
+  */
+final case class IndexRange(
+    start: Option[Int],
+    end: Option[Int],
+):
+  /** Selects the elements of the collection whose positions fall into this range. */
+  def slice[T, C <: IterableOnceOps[T, ?, C]](it: C): C =
+    // slice() clamps the upper bound on its own. Asking the collection for its size instead
+    // would exhaust one-shot collections (e.g., iterators) before they are sliced.
+    it.slice(start.getOrElse(0), end.getOrElse(Int.MaxValue))
+
+/** Parser for Rust-style index ranges.
+  */
+object IndexRange:
+  def apply(range: String): IndexRange =
+    apply(range, "--range")
+
+  /** Parses the range given as the value of a command-line argument.
+    * @param range
+    *   text of the range, e.g., '3', '..3', '3..', '1..3', or '1..=3'
+    * @param argumentName
+    *   name of the argument, used only for error reporting
+    * @throws InvalidArgument
+    *   if the text is not a valid range
+    */
+  def apply(range: String, argumentName: String): IndexRange =
+    try
+      range.trim match
+        case "" => IndexRange(None, None)
+        case s if s.contains("..") =>
+          val ix = s.indexOf("..")
+          val before = s.substring(0, ix)
+          val after = s.substring(ix + 2)
+          val start = Option.unless(before.isEmpty)(parseIndex(before))
+          val end =
+            if after.startsWith("=") then
+              // Inclusive upper bound; a lone '=' is accepted and leaves the range unbounded
+              Option.unless(after.length == 1)(Math.addExact(parseIndex(after.substring(1)), 1))
+            else Option.unless(after.isEmpty)(parseIndex(after))
+          IndexRange(start, end)
+        case s =>
+          val ix = parseIndex(s)
+          IndexRange(Some(ix), Some(Math.addExact(ix, 1)))
+    catch
+      // Only the parsing failures are turned into InvalidArgument, fatal errors are propagated
+      case NonFatal(_) =>
+        throw InvalidArgument(
+          argumentName,
+          range,
+          Some(
+            "Correct ranges are in the form '3' (one index), '..3' (up to exclusive), " +
+              "'3..' (from inclusive), or '1..3' (range up to exclusive), or '1..=3' (inclusive)",
+          ),
+        )
+
+  /** Parses a single index. Fails on anything that is not a non-negative integer. */
+  private def parseIndex(s: String): Int =
+    val ix = s.toInt
+    require(ix >= 0, s"Index must not be negative: $ix")
+    ix
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/IoUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/util/io/IoUtil.scala
similarity index 96%
rename from src/main/scala/eu/neverblink/jelly/cli/util/IoUtil.scala
rename to src/main/scala/eu/neverblink/jelly/cli/util/io/IoUtil.scala
index 6c675af..4ffe65b 100644
--- a/src/main/scala/eu/neverblink/jelly/cli/util/IoUtil.scala
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/io/IoUtil.scala
@@ -1,4 +1,4 @@
-package eu.neverblink.jelly.cli.util
+package eu.neverblink.jelly.cli.util.io
 
 import eu.neverblink.jelly.cli.*
 
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/io/YamlDocBuilder.scala
similarity index 98%
rename from src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala
rename to src/main/scala/eu/neverblink/jelly/cli/util/io/YamlDocBuilder.scala
index 0f7f657..74979b2 100644
--- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/io/YamlDocBuilder.scala
@@ -1,4 +1,4 @@
-package eu.neverblink.jelly.cli.util
+package eu.neverblink.jelly.cli.util.io
 
 import scala.collection.mutable
 
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/OrderedRdfCompare.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/OrderedRdfCompare.scala
new file mode 100644
index 0000000..62a18fa
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/OrderedRdfCompare.scala
@@ -0,0 +1,72 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import eu.neverblink.jelly.cli.CriticalException
+import eu.ostrzyciel.jelly.core.NamespaceDeclaration
+import org.apache.jena.graph.{Node, Triple}
+import org.apache.jena.sparql.core.Quad
+
+import scala.collection.mutable
+
+/** Compares two RDF streams element by element, in the order of appearance. Blank nodes are
+  * matched up to a one-to-one renaming of their labels.
+  */
+object OrderedRdfCompare extends RdfCompare:
+  import StatementUtils.*
+
+  def compare(
+      expected: StreamRdfCollector,
+      actual: StreamRdfCollector,
+  ): Unit =
+    val eSeq = expected.getBuffer
+    val aSeq = actual.getBuffer
+    if eSeq.size != aSeq.size then
+      throw new CriticalException(
+        s"Expected ${eSeq.size} RDF elements, but got ${aSeq.size} elements.",
+      )
+    // Blank node label mapping in both directions. The reverse map is needed to make sure that
+    // two distinct expected blank nodes are never matched with the same actual blank node.
+    val bNodeMap = mutable.Map.empty[String, String]
+    val bNodeReverseMap = mutable.Map.empty[String, String]
+    def tryIsomorphism(e: Seq[Node], a: Seq[Node], i: Int): Unit =
+      e.zip(a).foreach { (et, at) =>
+        if et.isBlank && at.isBlank then
+          val eId = et.getBlankNodeLabel
+          val aId = at.getBlankNodeLabel
+          bNodeMap.get(eId) match
+            case Some(mapped) =>
+              if mapped != aId then
+                throw new CriticalException(
+                  s"RDF element $i is different: expected $e, got $a. $eId is " +
+                    s"already mapped to $mapped.",
+                )
+            case None =>
+              bNodeReverseMap.get(aId).foreach { other =>
+                throw new CriticalException(
+                  s"RDF element $i is different: expected $e, got $a. $aId is " +
+                    s"already mapped from $other.",
+                )
+              }
+              bNodeMap(eId) = aId
+              bNodeReverseMap(aId) = eId
+        else if et != at then
+          throw new CriticalException(
+            s"RDF element $i is different: expected $e, got $a.",
+          )
+      }
+    eSeq.zip(aSeq).zipWithIndex.foreach { case ((e, a), i) =>
+      (e, a) match {
+        case (e: Triple, a: Triple) =>
+          tryIsomorphism(iterateTerms(e), iterateTerms(a), i)
+        case (e: Quad, a: Quad) =>
+          tryIsomorphism(iterateTerms(e), iterateTerms(a), i)
+        case (e: NamespaceDeclaration, a: NamespaceDeclaration) =>
+          if e != a then
+            throw new CriticalException(
+              s"RDF element $i is different: expected $e, got $a.",
+            )
+        case _ =>
+          throw new CriticalException(
+            s"RDF element $i is of different type: expected ${e.getClass}, got ${a.getClass}.",
+          )
+      }
+    }
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfCompare.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfCompare.scala
new file mode 100644
index 0000000..e51211d
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfCompare.scala
@@ -0,0 +1,14 @@
+package eu.neverblink.jelly.cli.util.jena
+
+/** Strategy for comparing two collected RDF streams. */
+trait RdfCompare:
+  /** Compares the two streams and throws an exception if they differ.
+    * @param expected
+    *   reference data
+    * @param actual
+    *   data under test
+    */
+  def compare(
+      expected: StreamRdfCollector,
+      actual: StreamRdfCollector,
+  ): Unit
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfElement.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfElement.scala
new
file mode 100644
index 0000000..6960b2b
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/RdfElement.scala
@@ -0,0 +1,8 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import eu.ostrzyciel.jelly.core.NamespaceDeclaration
+import org.apache.jena.graph.Triple
+import org.apache.jena.sparql.core.Quad
+
+/** Anything that can appear in a collected RDF stream. */
+type RdfElement = Triple | Quad | NamespaceDeclaration
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/StatementUtils.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/StatementUtils.scala
new file mode 100644
index 0000000..c571da4
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/StatementUtils.scala
@@ -0,0 +1,38 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import org.apache.jena.graph.{Node, Triple}
+import org.apache.jena.sparql.core.Quad
+
+/** Helpers for inspecting the terms of Jena triples and quads. */
+object StatementUtils:
+
+  /** Terms of the triple in the order: subject, predicate, object. */
+  def iterateTerms(t: Triple): Seq[Node] =
+    t.getSubject :: t.getPredicate :: t.getObject :: Nil
+
+  /** Terms of the quad in the order: subject, predicate, object, graph. */
+  def iterateTerms(q: Quad): Seq[Node] =
+    q.getSubject :: q.getPredicate :: q.getObject :: q.getGraph :: Nil
+
+  /** Checks if the triple is valid only in generalized RDF: its subject is something else than
+    * an IRI, a blank node, or a quoted triple, or its predicate is not an IRI.
+    */
+  def isGeneralized(t: Triple): Boolean =
+    !isRegularSubject(t.getSubject) || !t.getPredicate.isURI
+
+  /** Checks if the quad is valid only in generalized RDF: the same conditions as for triples
+    * apply, and a graph name that is neither an IRI nor a blank node also counts.
+    */
+  def isGeneralized(q: Quad): Boolean =
+    !isRegularSubject(q.getSubject) || !q.getPredicate.isURI ||
+      (!q.getGraph.isBlank && !q.getGraph.isURI)
+
+  /** Checks if any of the terms of the triple is a quoted triple. */
+  def isRdfStar(t: Triple): Boolean = iterateTerms(t).exists(_.isNodeTriple)
+
+  /** Checks if any of the terms of the quad is a quoted triple. */
+  def isRdfStar(q: Quad): Boolean = iterateTerms(q).exists(_.isNodeTriple)
+
+  // A quoted triple in the subject position is an RDF-star feature (reported by isRdfStar),
+  // it does not make the statement generalized.
+  private def isRegularSubject(n: Node): Boolean = n.isBlank || n.isURI || n.isNodeTriple
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/StreamRdfCollector.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/StreamRdfCollector.scala
new file mode 100644
index 0000000..9ef287e
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/StreamRdfCollector.scala
@@ -0,0 +1,42 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import eu.ostrzyciel.jelly.core.NamespaceDeclaration
+import org.apache.jena.graph.Triple
+import org.apache.jena.riot.system.StreamRDF
+import org.apache.jena.sparql.core.Quad
+
+import scala.collection.mutable
+
+/** A StreamRDF implementation that collects everything incoming into a single collection. This is
+  * not meant to be very scalable or performant.
+  */
+final class StreamRdfCollector extends StreamRDF:
+  private val buffer = mutable.ArrayBuffer.empty[RdfElement]
+
+  /** Returns an immutable snapshot of everything collected so far, in the order of arrival. */
+  def getBuffer: Seq[RdfElement] = buffer.toSeq
+
+  /** Feeds everything collected so far into the given stream, in the order of arrival. */
+  def replay(to: StreamRDF): Unit =
+    // Iterating over the buffer directly spares the copy that getBuffer makes
+    buffer.foreach {
+      case t: Triple => to.triple(t)
+      case q: Quad => to.quad(q)
+      case ns: NamespaceDeclaration => to.prefix(ns.prefix, ns.iri)
+    }
+
+  override def start(): Unit = ()
+
+  override def triple(triple: Triple): Unit =
+    buffer += triple
+
+  override def quad(quad: Quad): Unit =
+    buffer += quad
+
+  // The base IRI is not recorded
+  override def base(base: String): Unit = ()
+
+  override def prefix(prefix: String, iri: String): Unit =
+    buffer += NamespaceDeclaration(prefix, iri)
+
+  override def finish(): Unit = ()
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/UnorderedRdfCompare.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/UnorderedRdfCompare.scala
new file mode 100644
index 0000000..8b811b7
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/UnorderedRdfCompare.scala
@@ -0,0 +1,40 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import eu.neverblink.jelly.cli.CriticalException
+import org.apache.jena.riot.system.StreamRDFLib
+import org.apache.jena.sparql.core.DatasetGraphFactory
+
+import scala.jdk.CollectionConverters.*
+
+/** Compares two RDF streams as datasets, ignoring the order of statements: the default graphs
+  * and the named graphs with the same name must be isomorphic.
+  */
+object UnorderedRdfCompare extends RdfCompare:
+  def compare(
+      expected: StreamRdfCollector,
+      actual: StreamRdfCollector,
+  ): Unit =
+    val eDataset = DatasetGraphFactory.create()
+    val aDataset = DatasetGraphFactory.create()
+    expected.replay(StreamRDFLib.dataset(eDataset))
+    actual.replay(StreamRDFLib.dataset(aDataset))
+    // DatasetGraph.size() counts the named graphs only
+    if eDataset.size() != aDataset.size() then
+      throw new CriticalException(
+        s"Expected ${eDataset.size()} named graph(s), but got ${aDataset.size()}.",
+      )
+    if !eDataset.getDefaultGraph.isIsomorphicWith(aDataset.getDefaultGraph) then
+      throw new CriticalException(
+        "Default graph is not isomorphic with the expected one.",
+      )
+    // NOTE(review): graphs are paired by name, so named graphs identified by blank nodes will
+    // presumably be reported as missing -- confirm whether this case needs to be supported.
+    for name <- eDataset.listGraphNodes().asScala do
+      if !aDataset.containsGraph(name) then
+        throw new CriticalException(
+          s"Named graph $name is missing in the actual dataset.",
+        )
+      if !eDataset.getGraph(name).isIsomorphicWith(aDataset.getGraph(name)) then
+        throw new CriticalException(
+          s"Named graph $name is not isomorphic with the expected one.",
+        )
diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala
index 0bec3b4..1d3fcce 100644
--- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala
+++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala
@@ -3,6 +3,7 @@ package eu.neverblink.jelly.cli.command.rdf
 import com.google.protobuf.InvalidProtocolBufferException
 import eu.neverblink.jelly.cli.*
 import eu.neverblink.jelly.cli.command.helpers.*
+import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat
 import org.apache.jena.riot.RDFLanguages
 import org.scalatest.matchers.should.Matchers
 import org.scalatest.wordspec.AnyWordSpec
diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
index c6ac1c3..21de0b8 100644
--- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
+++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala
@@ -1,6 +1,7 @@
 package eu.neverblink.jelly.cli.command.rdf
 
 import eu.neverblink.jelly.cli.command.helpers.{DataGenHelper, TestFixtureHelper}
+import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat
 import eu.neverblink.jelly.cli.{ExitException, InvalidArgument, InvalidFormatSpecified}
 import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage
 import
eu.ostrzyciel.jelly.core.proto.v1.{LogicalStreamType, RdfStreamFrame}
diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidateSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidateSpec.scala
new file mode 100644
index 0000000..1c74b75
--- /dev/null
+++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidateSpec.scala
@@ -0,0 +1,510 @@
+package eu.neverblink.jelly.cli.command.rdf
+
+import eu.neverblink.jelly.cli.{CriticalException, ExitException}
+import eu.neverblink.jelly.cli.command.helpers.TestFixtureHelper
+import eu.ostrzyciel.jelly.core.RdfProtoDeserializationError
+import eu.ostrzyciel.jelly.core.JellyOptions
+import eu.ostrzyciel.jelly.core.proto.v1.*
+import org.scalatest.matchers.should.Matchers
+import org.scalatest.wordspec.AnyWordSpec
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, FileInputStream}
+import scala.util.Using
+
+/** End-to-end tests of the `rdf validate` command: delimiting checks, basic stream structure,
+ * stream options checks, and content comparison against a reference RDF file.
+ */
+class RdfValidateSpec extends AnyWordSpec, Matchers, TestFixtureHelper:
+  // NOTE(review): presumably the size of the fixtures generated by TestFixtureHelper - confirm.
+  protected val testCardinality: Int = 37
+
+  "rdf validate command" should {
+    "complain about empty input" in {
+      val e = intercept[ExitException] {
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+      e.cause.get shouldBe a[CriticalException]
+      e.cause.get.getMessage should include("First frame in the input stream is empty")
+    }
+
+    "validate delimiting" when {
+      // A single frame holding nothing but the stream options row.
+      val frame = RdfStreamFrame(
+        Seq(
+          RdfStreamRow(
+            RdfStreamOptions(
+              physicalType = PhysicalStreamType.QUADS,
+              maxNameTableSize = 100,
+              maxPrefixTableSize = 100,
+              maxDatatypeTableSize = 100,
+              logicalType = LogicalStreamType.DATASETS,
+              version = 1,
+            ),
+          ),
+        ),
+      )
+      // The same frame serialized in delimited and in undelimited form.
+      val bDelimited = {
+        val os = ByteArrayOutputStream()
+        frame.writeDelimitedTo(os)
+        os.toByteArray
+      }
+      val bUndelimited = {
+        val os = ByteArrayOutputStream()
+        frame.writeTo(os)
+        os.toByteArray
+      }
+
+      "no argument specified, input delimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bDelimited))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "no argument specified, input undelimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bUndelimited))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "--delimited=true, input delimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bDelimited))
+        RdfValidate.runTestCommand(List("rdf", "validate", "--delimited=true"))
+      }
+
+      "--delimited=true, input undelimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bUndelimited))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate", "--delimited=true"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include(
+          "Expected delimited input, but the file was not delimited",
+        )
+      }
+
+      "--delimited=false, input delimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bDelimited))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate", "--delimited=false"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include(
+          "Expected undelimited input, but the file was delimited",
+        )
+      }
+
+      "--delimited=false, input undelimited" in {
+        RdfValidate.setStdIn(ByteArrayInputStream(bUndelimited))
+        RdfValidate.runTestCommand(List("rdf", "validate", "--delimited=false"))
+      }
+
+      "invalid argument passed to --delimited" in {
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate", "--delimited=invalid"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("--delimited")
+        e.cause.get.getMessage should include(
+          "Valid values: true, false, either",
+        )
+      }
+    }
+
+    "validate basic stream structure" when {
+      "first row in stream is not options" in {
+        val f = RdfStreamFrame(
+          Seq(RdfStreamRow(RdfGraphStart())),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include(
+          "First row in the input stream does not contain stream options",
+        )
+      }
+
+      "triple used in a QUADS stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1),
+            ),
+            RdfStreamRow(RdfNameEntry(value = "a")),
+            RdfStreamRow(
+              RdfTriple(
+                RdfIri(0, 1),
+                RdfIri(0, 1),
+                RdfIri(0, 1),
+              ),
+            ),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[RdfProtoDeserializationError]
+        e.cause.get.getMessage should include("Unexpected triple row in stream")
+      }
+
+      // Row fixtures reused by the tests below. Each starts with a name table entry, followed by
+      // one statement the tests treat as RDF-star (nested RdfTriple) or generalized (RdfLiteral).
+      val rdfStarTriple = Seq(
+        RdfStreamRow(RdfNameEntry(value = "a")),
+        RdfStreamRow(
+          RdfTriple(
+            RdfIri(0, 1),
+            RdfIri(0, 1),
+            RdfTriple(RdfIri(0, 1), RdfIri(0, 1), RdfIri(0, 1)),
+          ),
+        ),
+      )
+      val generalizedTriple = Seq(
+        RdfStreamRow(RdfNameEntry(value = "a")),
+        RdfStreamRow(
+          RdfTriple(
+            RdfLiteral("aaaa"),
+            RdfIri(0, 1),
+            RdfIri(0, 1),
+          ),
+        ),
+      )
+      val rdfStarQuad = Seq(
+        RdfStreamRow(RdfNameEntry(value = "a")),
+        RdfStreamRow(
+          RdfQuad(
+            RdfIri(0, 1),
+            RdfIri(0, 1),
+            RdfTriple(RdfIri(0, 1), RdfIri(0, 1), RdfIri(0, 1)),
+            RdfIri(0, 1),
+          ),
+        ),
+      )
+      val generalizedQuad = Seq(
+        RdfStreamRow(RdfNameEntry(value = "a")),
+        RdfStreamRow(
+          RdfQuad(
+            RdfIri(0, 1),
+            RdfIri(0, 1),
+            RdfIri(0, 1),
+            RdfLiteral("aaaa"),
+          ),
+        ),
+      )
+
+      "RDF-star triple used in an RDF-star stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallRdfStar.withPhysicalType(PhysicalStreamType.TRIPLES).withVersion(1),
+            ),
+          ) ++ rdfStarTriple,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "RDF-star triple used in a non-RDF-star stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.TRIPLES).withVersion(1),
+            ),
+          ) ++ rdfStarTriple,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("Unexpected RDF-star triple in frame 0:")
+      }
+
+      "generalized triple used in a generalized stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallGeneralized.withPhysicalType(
+                PhysicalStreamType.TRIPLES,
+              ).withVersion(1),
+            ),
+          ) ++ generalizedTriple,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "generalized triple used in a non-generalized stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.TRIPLES).withVersion(1),
+            ),
+          ) ++ generalizedTriple,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("Unexpected generalized triple in frame 0:")
+      }
+
+      "RDF-star quad used in an RDF-star stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallRdfStar.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1),
+            ),
+          ) ++ rdfStarQuad,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "RDF-star quad used in a non-RDF-star stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1),
+            ),
+          ) ++ rdfStarQuad,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("Unexpected RDF-star quad in frame 0:")
+      }
+
+      "generalized quad used in a generalized stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallGeneralized.withPhysicalType(
+                PhysicalStreamType.QUADS,
+              ).withVersion(1),
+            ),
+          ) ++ generalizedQuad,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "generalized quad used in a non-generalized stream" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1),
+            ),
+          ) ++ generalizedQuad,
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("Unexpected generalized quad in frame 0:")
+      }
+
+      "repeated stream options (matching)" in {
+        val o = JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1)
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(o),
+            RdfStreamRow(o),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate"))
+      }
+
+      "repeated stream options (differing)" in {
+        val o = JellyOptions.smallStrict.withPhysicalType(PhysicalStreamType.QUADS).withVersion(1)
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(o),
+            RdfStreamRow(o.withVersion(2)),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include(
+          "Later occurrence of stream options in frame 0 does not match the first",
+        )
+      }
+    }
+
+    "validate options" when {
+      "invalid input options supplied, no validation source" in {
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallStrict,
+            ),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate"))
+        }
+        e.cause.get shouldBe a[RdfProtoDeserializationError]
+        e.cause.get.getMessage should include("Incoming physical stream type is not set")
+      }
+
+      "same input options supplied as in the validation source" in withFullJellyFile { j =>
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.smallAllFeatures
+                .withPhysicalType(PhysicalStreamType.TRIPLES)
+                .withLogicalType(LogicalStreamType.FLAT_TRIPLES)
+                .withVersion(1),
+            ),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        RdfValidate.runTestCommand(List("rdf", "validate", "--options-file", j))
+      }
+
+      "different input options supplied than in the validation source" in withFullJellyFile { j =>
+        val f = RdfStreamFrame(
+          Seq(
+            RdfStreamRow(
+              JellyOptions.bigStrict
+                .withPhysicalType(PhysicalStreamType.TRIPLES)
+                .withVersion(1),
+            ),
+          ),
+        )
+        RdfValidate.setStdIn(ByteArrayInputStream(f.toByteArray))
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(List("rdf", "validate", "--options-file", j))
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include("Stream options do not match the expected options")
+      }
+    }
+
+    "validate content" when {
+      "content matches the reference RDF file" in withFullJenaFile { jenaF =>
+        withFullJellyFile { jellyF =>
+          RdfValidate.runTestCommand(
+            List(
+              "rdf",
+              "validate",
+              "--compare-to-rdf-file=" + jenaF,
+              jellyF,
+            ),
+          )
+        }
+      }
+
+      "content matches the reference RDF file, ordered comparison" in withFullJenaFile { jenaF =>
+        withFullJellyFile { jellyF =>
+          RdfValidate.runTestCommand(
+            List(
+              "rdf",
+              "validate",
+              "--compare-to-rdf-file=" + jenaF,
+              "--compare-to-format=nt",
+              "--compare-ordered=true",
+              jellyF,
+            ),
+          )
+        }
+      }
+
+      "content matches the reference RDF file, using a slice of the stream" in withFullJenaFile {
+        jenaF =>
+          withFullJellyFile { jellyF =>
+            val frame1 = Using.resource(FileInputStream(jellyF)) { is =>
+              RdfStreamFrame.parseDelimitedFrom(is).get
+            }
+            // Frame 0 comes from the reference Jelly file; frames 1 to 10 add one extra triple
+            // each, so the comparison is restricted to frame 0 via --compare-frame-indices.
+            val frames = frame1 +: (1 to 10).map { i =>
+              RdfStreamFrame(
+                Seq(RdfStreamRow(RdfTriple(RdfIri(0, i), RdfIri(0, i), RdfLiteral("aaaa")))),
+              )
+            }
+            val b = {
+              val os = ByteArrayOutputStream()
+              frames.foreach(_.writeDelimitedTo(os))
+              os.toByteArray
+            }
+            RdfValidate.setStdIn(ByteArrayInputStream(b))
+            RdfValidate.runTestCommand(
+              List(
+                "rdf",
+                "validate",
+                "--compare-to-rdf-file=" + jenaF,
+                "--compare-to-format=nt",
+                "--compare-ordered=true",
+                "--compare-frame-indices=0",
+              ),
+            )
+          }
+      }
+
+      "content does not match the reference RDF file, using slices" in withFullJenaFile { jenaF =>
+        withFullJellyFile { jellyF =>
+          val frame1 = Using.resource(FileInputStream(jellyF)) { is =>
+            RdfStreamFrame.parseDelimitedFrom(is).get
+          }
+          // The range 0..4 has an exclusive end, so it also selects three single-triple frames
+          // that are absent from the reference file: 40 elements instead of 37.
+          val frames = frame1 +: (1 to 10).map { i =>
+            RdfStreamFrame(
+              Seq(RdfStreamRow(RdfTriple(RdfIri(0, i), RdfIri(0, i), RdfLiteral("aaaa")))),
+            )
+          }
+          val b = {
+            val os = ByteArrayOutputStream()
+            frames.foreach(_.writeDelimitedTo(os))
+            os.toByteArray
+          }
+          RdfValidate.setStdIn(ByteArrayInputStream(b))
+          val e = intercept[ExitException] {
+            RdfValidate.runTestCommand(
+              List(
+                "rdf",
+                "validate",
+                "--compare-to-rdf-file=" + jenaF,
+                "--compare-to-format=nt",
+                "--compare-ordered=true",
+                "--compare-frame-indices=0..4",
+              ),
+            )
+          }
+          e.cause.get shouldBe a[CriticalException]
+          e.cause.get.getMessage should include(
+            "Expected 37 RDF elements, but got 40",
+          )
+        }
+      }
+
+      "RDF file for comparison has an unrecognized format" in {
+        val e = intercept[ExitException] {
+          RdfValidate.runTestCommand(
+            List(
+              "rdf",
+              "validate",
+              "--compare-to-rdf-file=test.txt",
+              "--compare-to-format=invalid",
+              "--compare-frame-indices=0..4",
+            ),
+          )
+        }
+        e.cause.get shouldBe a[CriticalException]
+        e.cause.get.getMessage should include(
+          "Invalid format option: \"invalid\"",
+        )
+      }
+    }
+  }
diff --git a/src/test/scala/eu/neverblink/jelly/cli/util/args/IndexRangeSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/util/args/IndexRangeSpec.scala
new file mode 100644
index 0000000..1e179af
--- /dev/null
+++ b/src/test/scala/eu/neverblink/jelly/cli/util/args/IndexRangeSpec.scala
@@ -0,0 +1,80 @@
+package eu.neverblink.jelly.cli.util.args
+
+import eu.neverblink.jelly.cli.InvalidArgument
+import org.scalatest.matchers.should.Matchers
+import org.scalatest.wordspec.AnyWordSpec
+
+/** Tests for parsing `IndexRange` from its textual form (`a`, `a..b`, `a..=b`, open-ended
+ * variants) and for slicing collections with it. Parsed ranges hold an exclusive end.
+ */
+class IndexRangeSpec extends AnyWordSpec, Matchers:
+  "IndexRange" should {
+    "parse a single index" in {
+      IndexRange("1") should be(IndexRange(Some(1), Some(2)))
+    }
+
+    "parse a range" in {
+      IndexRange("1..3") should be(IndexRange(Some(1), Some(3)))
+    }
+
+    "parse a range with inclusive end" in {
+      IndexRange("1..=3") should be(IndexRange(Some(1), Some(4)))
+    }
+
+    "parse a range without start" in {
+      IndexRange("..3") should be(IndexRange(None, Some(3)))
+    }
+
+    "parse a range without end" in {
+      IndexRange("1..") should be(IndexRange(Some(1), None))
+    }
+
+    "parse a range with empty start and end" in {
+      IndexRange("..") should be(IndexRange(None, None))
+    }
+
+    "parse a range with empty start and end with equals" in {
+      IndexRange("..=") should be(IndexRange(None, None))
+    }
+
+    "parse a range with empty start and inclusive end" in {
+      IndexRange("..=3") should be(IndexRange(None, Some(4)))
+    }
+
+    "parse an empty string" in {
+      IndexRange("") should be(IndexRange(None, None))
+    }
+
+    "not parse an invalid range (a..b)" in {
+      val e = intercept[InvalidArgument] {
+        IndexRange("a..b")
+      }
+      e.argument should be("--range")
+      e.argumentValue should be("a..b")
+      e.message.get should include("Correct ranges are in the form")
+    }
+
+    "not parse an invalid range (123a)" in {
+      val e = intercept[InvalidArgument] {
+        IndexRange("123a")
+      }
+      e.argument should be("--range")
+      e.argumentValue should be("123a")
+      e.message.get should include("Correct ranges are in the form")
+    }
+
+    "not parse an invalid range (asdad)" in {
+      val e = intercept[InvalidArgument] {
+        IndexRange("asdad")
+      }
+      e.argument should be("--range")
+      e.argumentValue should be("asdad")
+      e.message.get should include("Correct ranges are in the form")
+    }
+
+    "slice a collection" in {
+      val range = IndexRange(Some(1), Some(3))
+      val collection = Seq(0, 1, 2, 3, 4)
+      range.slice(collection) should be(Seq(1, 2))
+    }
+  }
diff --git a/src/test/scala/eu/neverblink/jelly/cli/util/jena/RdfCompareSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/util/jena/RdfCompareSpec.scala
new file mode 100644
index 0000000..617493c
--- /dev/null
+++ b/src/test/scala/eu/neverblink/jelly/cli/util/jena/RdfCompareSpec.scala
@@ -0,0 +1,221 @@
+package eu.neverblink.jelly.cli.util.jena
+
+import eu.neverblink.jelly.cli.CriticalException
+import eu.ostrzyciel.jelly.core.NamespaceDeclaration
+import org.apache.jena.graph.{NodeFactory, Triple}
+import org.apache.jena.sparql.core.Quad
+import org.scalatest.matchers.should.Matchers
+import org.scalatest.wordspec.AnyWordSpec
+
+/** Tests for `OrderedRdfCompare` and `UnorderedRdfCompare`, fed with hand-built
+ * `StreamRdfCollector` instances. A comparison passes when `compare` returns without throwing.
+ */
+class RdfCompareSpec extends AnyWordSpec, Matchers:
+  // Test triples and quads
+  private val t_iri_1 = Triple.create(
+    NodeFactory.createURI("http://example.com/s"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val t_iri_2 = Triple.create(
+    NodeFactory.createURI("http://example.com/s2"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val t_bnode_1 = Triple.create(
+    NodeFactory.createBlankNode("b1"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val t_bnode_2 = Triple.create(
+    NodeFactory.createBlankNode("b2"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val q_bnode_1 = Quad.create(
+    NodeFactory.createURI("http://example.com/g"),
+    NodeFactory.createBlankNode("b1"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val q_bnode_2 = Quad.create(
+    NodeFactory.createURI("http://example.com/g"),
+    NodeFactory.createBlankNode("b2"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val q_bnode_3 = Quad.create(
+    NodeFactory.createURI("http://example.com/g2"),
+    NodeFactory.createBlankNode("b1"),
+    NodeFactory.createURI("http://example.com/p"),
+    NodeFactory.createURI("http://example.com/o"),
+  )
+  private val ns1 = NamespaceDeclaration("ex", "http://example.com/")
+  private val ns2 = NamespaceDeclaration("ex2", "http://example.com/")
+
+  // Reference stream: one prefix declaration, two triples and one quad.
+  val c1: StreamRdfCollector = StreamRdfCollector()
+  c1.prefix(ns1.prefix, ns1.iri)
+  c1.triple(t_iri_1)
+  c1.triple(t_bnode_1)
+  c1.quad(q_bnode_1)
+
+  // Same as c1, except that the blank nodes are labelled b2 instead of b1.
+  val c2: StreamRdfCollector = StreamRdfCollector()
+  c2.prefix(ns1.prefix, ns1.iri)
+  c2.triple(t_iri_1)
+  c2.triple(t_bnode_2)
+  c2.quad(q_bnode_2)
+
+  "OrderedRdfCompare" should {
+    "match identical streams (identity)" in {
+      OrderedRdfCompare.compare(c1, c1)
+    }
+
+    "match identical streams (same data)" in {
+      val c1a = StreamRdfCollector()
+      c1a.prefix(ns1.prefix, ns1.iri)
+      c1a.triple(t_iri_1)
+      c1a.triple(t_bnode_1)
+      c1a.quad(q_bnode_1)
+      OrderedRdfCompare.compare(c1, c1a)
+    }
+
+    "match streams with differing blank node IDs" in {
+      OrderedRdfCompare.compare(c1, c2)
+    }
+
+    "not match streams with different namespace declarations" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns2.prefix, ns2.iri)
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_1)
+      val e = intercept[CriticalException] {
+        OrderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include("RDF element 0 is different: expected")
+    }
+
+    "not match streams with missing namespace declarations" in {
+      val c3 = StreamRdfCollector()
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_1)
+      val e = intercept[CriticalException] {
+        OrderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include("Expected 4 RDF elements, but got 3 elements")
+    }
+
+    "not match streams with reordered namespace declarations" in {
+      val c3 = StreamRdfCollector()
+      c3.triple(t_iri_1)
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_1)
+      val e = intercept[CriticalException] {
+        OrderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include("RDF element 0 is of different type: expected")
+    }
+
+    "not match streams with conflicting blank node mappings" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_2)
+      val e = intercept[CriticalException] {
+        OrderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include("RDF element 3 is different: expected")
+      e.getMessage should include("b1 is already mapped to b1")
+    }
+
+    "not match streams with different IRIs" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_2)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_1)
+      val e = intercept[CriticalException] {
+        OrderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include("RDF element 1 is different: expected")
+    }
+  }
+
+  "UnorderedRdfCompare" should {
+    "match identical streams (identity)" in {
+      UnorderedRdfCompare.compare(c1, c1)
+    }
+
+    "match identical streams (same data)" in {
+      val c1a = StreamRdfCollector()
+      c1a.prefix(ns1.prefix, ns1.iri)
+      c1a.triple(t_iri_1)
+      c1a.triple(t_bnode_1)
+      c1a.quad(q_bnode_1)
+      UnorderedRdfCompare.compare(c1, c1a)
+    }
+
+    "match streams with differing blank node IDs" in {
+      UnorderedRdfCompare.compare(c1, c2)
+    }
+
+    "not match streams with a missing named graph" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      val e = intercept[CriticalException] {
+        UnorderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include(
+        "Expected 1 named graph(s), but got 0",
+      )
+    }
+
+    "not match streams with different graph names" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_3)
+      val e = intercept[CriticalException] {
+        UnorderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include(
+        "Named graph http://example.com/g is missing in the actual dataset",
+      )
+    }
+
+    "not match streams with a different default graph" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_1)
+      c3.quad(q_bnode_2)
+      val e = intercept[CriticalException] {
+        UnorderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include(
+        "Default graph is not isomorphic with the expected one",
+      )
+    }
+
+    "not match streams with different contents of a named graph" in {
+      val c3 = StreamRdfCollector()
+      c3.prefix(ns1.prefix, ns1.iri)
+      c3.triple(t_iri_1)
+      c3.triple(t_bnode_1)
+      c3.quad(q_bnode_1)
+      c3.quad(q_bnode_2)
+      val e = intercept[CriticalException] {
+        UnorderedRdfCompare.compare(c1, c3)
+      }
+      e.getMessage should include(
+        "Named graph http://example.com/g is not isomorphic with the expected one",
+      )
+    }
+  }