From 645f1525a10b575d74ebecab4e16e71a63757586 Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Fri, 4 Apr 2025 11:25:23 +0200 Subject: [PATCH 1/6] Add YAML-compliant MVP --- build.sbt | 1 + .../scala/eu/neverblink/jelly/cli/App.scala | 1 + .../jelly/cli/command/rdf/RdfFromJelly.scala | 24 +---- .../jelly/cli/command/rdf/RdfInspect.scala | 75 +++++++++++++ .../neverblink/jelly/cli/util/JellyUtil.scala | 100 ++++++++++++++++++ .../jelly/cli/util/YamlDocBuilder.scala | 53 ++++++++++ .../command/helpers/TestFixtureHelper.scala | 12 ++- .../cli/command/rdf/RdfInspectSpec.scala | 38 +++++++ 8 files changed, 278 insertions(+), 26 deletions(-) create mode 100644 src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala create mode 100644 src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala create mode 100644 src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala create mode 100644 src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala diff --git a/build.sbt b/build.sbt index 359d8d5..41a52b1 100644 --- a/build.sbt +++ b/build.sbt @@ -38,6 +38,7 @@ lazy val root = (project in file(".")) "eu.ostrzyciel.jelly" %% "jelly-jena" % jellyV, "com.github.alexarchambault" %% "case-app" % "2.1.0-M30", "org.scalatest" %% "scalatest" % "3.2.19" % Test, + "org.yaml" % "snakeyaml" % "2.4" % Test, ), scalacOptions ++= Seq( "-Wunused:imports", diff --git a/src/main/scala/eu/neverblink/jelly/cli/App.scala b/src/main/scala/eu/neverblink/jelly/cli/App.scala index 9b869f6..1585481 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/App.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/App.scala @@ -22,4 +22,5 @@ object App extends CommandsEntryPoint: Version, RdfFromJelly, RdfToJelly, + RdfInspect, ) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala index b111488..bd72c80 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala @@ -3,9 +3,9 @@ import caseapp.* import eu.neverblink.jelly.cli.* import eu.neverblink.jelly.cli.command.rdf.RdfFormat.* import eu.neverblink.jelly.cli.command.rdf.RdfFormat.Jena.* +import eu.neverblink.jelly.cli.util.JellyUtil import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame -import eu.ostrzyciel.jelly.core.IoUtils import org.apache.jena.riot.system.StreamRDFWriter import org.apache.jena.riot.{Lang, RDFParser} @@ -83,30 +83,10 @@ object RdfFromJelly extends RdfCommand[RdfFromJellyOptions, RdfFormat.Writeable] outputStream.write(frame.getBytes) try { - iterateRdfStream(inputStream, outputStream).zipWithIndex.foreach { + JellyUtil.iterateRdfStream(inputStream).zipWithIndex.foreach { case (maybeFrame, frameIndex) => writeFrameToOutput(maybeFrame, frameIndex) } } finally { outputStream.flush() } - - /** This method reads the Jelly file and returns an iterator of RdfStreamFrame - * @param inputStream - * @param outputStream - * @return - */ - private def iterateRdfStream( - inputStream: InputStream, - outputStream: OutputStream, - ): Iterator[RdfStreamFrame] = - IoUtils.autodetectDelimiting(inputStream) match - case (false, newIn) => - // Non-delimited Jelly file - // In this case, we can only read one frame - Iterator(RdfStreamFrame.parseFrom(newIn)) - case (true, newIn) => - // Delimited Jelly file - // In this case, we can read multiple frames - Iterator.continually(RdfStreamFrame.parseDelimitedFrom(newIn)) - .takeWhile(_.isDefined).map(_.get) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala new file mode 100644 index 0000000..e84dfcb --- /dev/null +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -0,0 +1,75 @@ +package eu.neverblink.jelly.cli.command.rdf + +import caseapp.{ExtraName, Recurse} +import caseapp.core.RemainingArgs +import eu.neverblink.jelly.cli.util.{FrameInfo, JellyUtil, MetricsPrinter} +import eu.neverblink.jelly.cli.{HasJellyCommandOptions, JellyCommand, JellyCommandOptions} +import eu.ostrzyciel.jelly.core.proto.v1.* + +import java.io.InputStream + +case class RdfInspectOptions( + @Recurse + common: JellyCommandOptions = JellyCommandOptions(), + @ExtraName("to") outputFile: Option[String] = None, + @ExtraName("per-frame") perFrame: Boolean = false, +) extends HasJellyCommandOptions + +object RdfInspect extends JellyCommand[RdfInspectOptions]: + + override def names: List[List[String]] = List( + List("rdf", "inspect"), + ) + // from what we've talked about yesterday it also sounded like we should accept some optional parameters + // specifying which frames / parts of the stream to compute the metrics for + // probably when only frameStart specified we should compute it only for this frame then + + override final def group = "rdf" + + override def doRun(options: RdfInspectOptions, remainingArgs: RemainingArgs): Unit = + val (inputStream, outputStream) = + this.getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile) + val printer = inspectJelly(inputStream) + if options.perFrame then printer.printPerFrame(outputStream) + else printer.printAggregate(outputStream) + + def inspectJelly( + inputStream: InputStream, + ): MetricsPrinter = + val printer = new MetricsPrinter + // Here we can easily compute overall metrics + + inline def computeMetrics(frame: RdfStreamFrame, frameIndex: Int): Unit = + if printer.printOptions.isEmpty then + if frame.rows.nonEmpty && frame.rows.head.row.isOptions then + printer.printOptions = Some(frame.rows.head.row.asInstanceOf[RdfStreamOptions]) + else throw new RuntimeException("First row of the frame is not an options row") + val metrics = new FrameInfo() + frame.rows.foreach(r => metricsForRow(r, metrics)) + printer.frameInfo += metrics + + try { + JellyUtil.iterateRdfStream(inputStream).zipWithIndex.foreach { + case (maybeFrame, frameIndex) => computeMetrics(maybeFrame, frameIndex) + } + printer + } catch { + case e: Exception => + throw new RuntimeException("Error inspecting Jelly file", e) + } + + private def metricsForRow( + row: RdfStreamRow, + metadata: FrameInfo, + ): Unit = + row.row match { + case r: RdfTriple => metadata.tripleCount += 1 + case r: RdfQuad => metadata.quadCount += 1 + case r: RdfNameEntry => metadata.nameCount += 1 + case r: RdfPrefixEntry => metadata.prefixCount += 1 + case r: RdfNamespaceDeclaration => metadata.namespaceCount += 1 + case r: RdfDatatypeEntry => metadata.datatypeCount += 1 + case r: RdfGraphStart => metadata.graphStartCount += 1 + case r: RdfGraphEnd => metadata.graphEndCount += 1 + case r: RdfStreamOptions => metadata.optionCount += 1 + } diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala new file mode 100644 index 0000000..119aec4 --- /dev/null +++ b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala @@ -0,0 +1,100 @@ +package eu.neverblink.jelly.cli.util + +import eu.ostrzyciel.jelly.core.IoUtils +import eu.ostrzyciel.jelly.core.proto.v1.{RdfStreamFrame, RdfStreamOptions} + +import java.io.{InputStream, OutputStream} +import scala.collection.mutable.ListBuffer + +class FrameInfo: + var metadata: Map[String, String] = Map.empty + var optionCount: Int = 0 + var nameCount: Int = 0 + var namespaceCount: Int = 0 + var tripleCount: Int = 0 + var quadCount: Int = 0 + var prefixCount: Int = 0 + var datatypeCount: Int = 0 + var graphStartCount: Int = 0 + var graphEndCount: Int = 0 + +end FrameInfo + +class MetricsPrinter: + + var printOptions: Option[RdfStreamOptions] = None + var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty + + def printPerFrame(o: OutputStream): Unit = { + val parsedString = YamlDocBuilder.build(YamlDocBuilder.YamlList(frameInfo.map { frame => + YamlDocBuilder.YamlMap( + "optionCount" -> YamlDocBuilder.YamlString(frame.optionCount.toString), + "nameCount" -> YamlDocBuilder.YamlString(frame.nameCount.toString), + "namespaceCount" -> YamlDocBuilder.YamlString(frame.namespaceCount.toString), + "tripleCount" -> YamlDocBuilder.YamlString(frame.tripleCount.toString), + "quadCount" -> YamlDocBuilder.YamlString(frame.quadCount.toString), + "prefixCount" -> YamlDocBuilder.YamlString(frame.prefixCount.toString), + "datatypeCount" -> YamlDocBuilder.YamlString(frame.datatypeCount.toString), + "graphStartCount" -> YamlDocBuilder.YamlString(frame.graphStartCount.toString), + "graphEndCount" -> YamlDocBuilder.YamlString(frame.graphEndCount.toString), + ) + }.toSeq)) + o.write(parsedString.getBytes) + + } + + def printAggregate(o: OutputStream): Unit = { + val sumCounts = frameInfo.reduce((a, b) => { + a.optionCount += b.optionCount + a.nameCount += b.nameCount + a.namespaceCount += b.namespaceCount + a.tripleCount += b.tripleCount + a.quadCount += b.quadCount + a.prefixCount += b.prefixCount + a.datatypeCount += b.datatypeCount + a.graphStartCount += b.graphStartCount + a.graphEndCount += b.graphEndCount + a + }) + val parsedString = YamlDocBuilder.build( + YamlDocBuilder.YamlMap( + "optionCount" -> YamlDocBuilder.YamlString(sumCounts.optionCount.toString), + "nameCount" -> YamlDocBuilder.YamlString(sumCounts.nameCount.toString), + "nameSpaceCount" -> YamlDocBuilder.YamlString(sumCounts.namespaceCount.toString), + "tripleCount" -> YamlDocBuilder.YamlString(sumCounts.tripleCount.toString), + "quadCount" -> YamlDocBuilder.YamlString(sumCounts.quadCount.toString), + "prefixCount" -> YamlDocBuilder.YamlString(sumCounts.prefixCount.toString), + "datatypeCount" -> YamlDocBuilder.YamlString(sumCounts.datatypeCount.toString), + "graphStartCount" -> YamlDocBuilder.YamlString(sumCounts.graphStartCount.toString), + "graphEndCount" -> YamlDocBuilder.YamlString(sumCounts.graphEndCount.toString), + ), + ) + o.write(parsedString.getBytes) + } + +end MetricsPrinter + +object MetricsPrinter: + +end MetricsPrinter + +object JellyUtil: + /** This method reads the Jelly file and returns an iterator of RdfStreamFrame + * + * @param inputStream + * @param outputStream + * @return + */ + def iterateRdfStream( + inputStream: InputStream, + ): Iterator[RdfStreamFrame] = + IoUtils.autodetectDelimiting(inputStream) match + case (false, newIn) => + // Non-delimited Jelly file + // In this case, we can only read one frame + Iterator(RdfStreamFrame.parseFrom(newIn)) + case (true, newIn) => + // Delimited Jelly file + // In this case, we can read multiple frames + Iterator.continually(RdfStreamFrame.parseDelimitedFrom(newIn)) + .takeWhile(_.isDefined).map(_.get) diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala new file mode 100644 index 0000000..bd5aa08 --- /dev/null +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -0,0 +1,53 @@ +package eu.neverblink.jelly.cli.util + +object YamlDocBuilder: + /** A lightweight YAML document builder based on + * https://github.com/RiverBench/ci-worker/blob/main/src%2Fmain%2Fscala%2Futil%2FYamlDocBuilder.scala + */ + + sealed trait YamlValue + case class YamlString(v: String) extends YamlValue + + case class YamlList(v: Seq[YamlValue]) extends YamlValue + + object YamlMap: + def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.toMap) + def apply(k: String, v: String): YamlMap = YamlMap(Map(k -> YamlString(v))) + def apply(k: String, v: YamlValue): YamlMap = YamlMap(Map(k -> v)) + + case class YamlMap(v: Map[String, YamlValue]) extends YamlValue + + def build(root: YamlValue): String = + val sb = new StringBuilder + build(root, sb, 0) + sb.toString + + private def build(root: YamlValue, sb: StringBuilder, indent: Int): Unit = + root match + case YamlString(v) => + sb.append(quoteAndEscape(v)) + case YamlList(v) => + sb.append(System.lineSeparator()) + v.foreach { e => + sb.append(" " * indent).append("- ") + build(e, sb, indent + 1) + if e != v.last then sb.append(System.lineSeparator()) + } + case YamlMap(v) => + v.zipWithIndex.foreach { case ((k, e), ix) => + if ix != 0 then sb.append(" " * indent) + sb.append(quoteAndEscape(k)) + sb.append(": ") + build(e, sb, indent + 1) + if ix != v.size - 1 then sb.append(System.lineSeparator()) + } + + private def quoteAndEscape(s: String): String = + "\"" + escape(s) + "\"" + + private def escape(s: String): String = + s.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala b/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala index f08de69..1570391 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala @@ -1,7 +1,7 @@ package eu.neverblink.jelly.cli.command.helpers -import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage -import org.apache.jena.riot.{Lang, RDFDataMgr, RDFLanguages} +import eu.ostrzyciel.jelly.convert.jena.riot.{JellyFormatVariant, JellyLanguage} +import org.apache.jena.riot.{Lang, RDFDataMgr, RDFFormat, RDFLanguages} import org.apache.jena.sys.JenaSystem import org.scalatest.BeforeAndAfterAll import org.scalatest.wordspec.AnyWordSpec @@ -67,11 +67,15 @@ trait TestFixtureHelper extends BeforeAndAfterAll: testCode(tempFile.toString) } finally { tempFile.toFile.delete() } - def withFullJellyFile(testCode: (String) => Any): Unit = + def withFullJellyFile(testCode: (String) => Any, frameSize: Int = 256): Unit = val extension = getFileExtension(JellyLanguage.JELLY) val tempFile = Files.createTempFile(tmpDir, randomUUID.toString, f".${extension}") + val customFormat = new RDFFormat( + JellyLanguage.JELLY, + JellyFormatVariant(frameSize = frameSize), + ) val model = DataGenHelper.generateTripleModel(testCardinality) - RDFDataMgr.write(new FileOutputStream(tempFile.toFile), model, JellyLanguage.JELLY) + RDFDataMgr.write(new FileOutputStream(tempFile.toFile), model, customFormat) try { testCode(tempFile.toString) } finally { tempFile.toFile.delete() } diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala new file mode 100644 index 0000000..4065204 --- /dev/null +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala @@ -0,0 +1,38 @@ +package eu.neverblink.jelly.cli.command.rdf + +import eu.neverblink.jelly.cli.command.helpers.TestFixtureHelper +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec +import org.yaml.snakeyaml.Yaml + +class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: + protected val testCardinality: Int = 33 + + "rdf inspect command" should { + "be able to return aggregate of all frames as a valid Yaml" in withFullJellyFile { j => + val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", j)) + val yaml = new Yaml() + try { + yaml.load(out) + true + } catch { + case e: Exception => + fail("Failed to parse YAML output", e) + } + } + "be able to return all frames separately as a valid Yaml" in withFullJellyFile( + testCode = { j => + // I'll probably have to make a Jelly file with multiple frames to test it well + val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", "--per-frame", j)) + val yaml = new Yaml() + try { + yaml.load(out) + true + } catch { + case e: Exception => + fail("Failed to parse YAML output", e) + } + }, + frameSize = 15, + ) + } From 0c55333a0f00c9a03c0d138aa39c0767e755f8bf Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Fri, 4 Apr 2025 14:49:03 +0200 Subject: [PATCH 2/6] Add all necessary functionality --- .../jelly/cli/command/rdf/RdfFromJelly.scala | 2 +- .../jelly/cli/command/rdf/RdfInspect.scala | 5 +- .../jelly/cli/command/rdf/RdfToJelly.scala | 2 +- ...ommand.scala => RdfTranscodeCommand.scala} | 4 +- .../neverblink/jelly/cli/util/JellyUtil.scala | 114 ++++++++++++------ .../jelly/cli/util/YamlDocBuilder.scala | 20 ++- .../cli/command/rdf/RdfInspectSpec.scala | 1 - 7 files changed, 95 insertions(+), 53 deletions(-) rename src/main/scala/eu/neverblink/jelly/cli/command/rdf/{RdfCommand.scala => RdfTranscodeCommand.scala} (95%) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala index bd72c80..aa6db16 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala @@ -25,7 +25,7 @@ case class RdfFromJellyOptions( @ExtraName("out-format") outputFormat: Option[String] = None, ) extends HasJellyCommandOptions -object RdfFromJelly extends RdfCommand[RdfFromJellyOptions, RdfFormat.Writeable]: +object RdfFromJelly extends RdfTranscodeCommand[RdfFromJellyOptions, RdfFormat.Writeable]: override def names: List[List[String]] = List( List("rdf", "from-jelly"), diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala index e84dfcb..fd642df 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -20,9 +20,6 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: override def names: List[List[String]] = List( List("rdf", "inspect"), ) - // from what we've talked about yesterday it also sounded like we should accept some optional parameters - // specifying which frames / parts of the stream to compute the metrics for - // probably when only frameStart specified we should compute it only for this frame then override final def group = "rdf" @@ -33,7 +30,7 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: if options.perFrame then printer.printPerFrame(outputStream) else printer.printAggregate(outputStream) - def inspectJelly( + private def inspectJelly( inputStream: InputStream, ): MetricsPrinter = val printer = new MetricsPrinter diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala index a50f468..276c13b 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJelly.scala @@ -40,7 +40,7 @@ case class RdfToJellyOptions( delimited: Boolean = true, ) extends HasJellyCommandOptions -object RdfToJelly extends RdfCommand[RdfToJellyOptions, RdfFormat.Readable]: +object RdfToJelly extends RdfTranscodeCommand[RdfToJellyOptions, RdfFormat.Readable]: override def names: List[List[String]] = List( List("rdf", "to-jelly"), diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommand.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala similarity index 95% rename from src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommand.scala rename to src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala index 2696a0f..71a3c42 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfCommand.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfTranscodeCommand.scala @@ -12,8 +12,8 @@ import java.io.{InputStream, OutputStream} /** This abstract class is responsible for the common logic in both RDF parsing commands */ -abstract class RdfCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat](using - tt: TypeTest[RdfFormat, F], +abstract class RdfTranscodeCommand[T <: HasJellyCommandOptions: {Parser, Help}, F <: RdfFormat]( + using tt: TypeTest[RdfFormat, F], ) extends JellyCommand[T]: override final def group = "rdf" diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala index 119aec4..2abf216 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala @@ -1,5 +1,7 @@ package eu.neverblink.jelly.cli.util +import eu.neverblink.jelly.cli.util.MetricsPrinter.{formatOptions, formatStats} +import eu.neverblink.jelly.cli.util.YamlDocBuilder.YamlMap import eu.ostrzyciel.jelly.core.IoUtils import eu.ostrzyciel.jelly.core.proto.v1.{RdfStreamFrame, RdfStreamOptions} @@ -7,7 +9,6 @@ import java.io.{InputStream, OutputStream} import scala.collection.mutable.ListBuffer class FrameInfo: - var metadata: Map[String, String] = Map.empty var optionCount: Int = 0 var nameCount: Int = 0 var namespaceCount: Int = 0 @@ -18,6 +19,19 @@ class FrameInfo: var graphStartCount: Int = 0 var graphEndCount: Int = 0 + def +=(other: FrameInfo): FrameInfo = { + this.optionCount += other.optionCount + this.nameCount += other.nameCount + this.namespaceCount += other.namespaceCount + this.tripleCount += other.tripleCount + this.quadCount += other.quadCount + this.prefixCount += other.prefixCount + this.datatypeCount += other.datatypeCount + this.graphStartCount += other.graphStartCount + this.graphEndCount += other.graphEndCount + this + } + end FrameInfo class MetricsPrinter: @@ -26,56 +40,76 @@ class MetricsPrinter: var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty def printPerFrame(o: OutputStream): Unit = { - val parsedString = YamlDocBuilder.build(YamlDocBuilder.YamlList(frameInfo.map { frame => - YamlDocBuilder.YamlMap( - "optionCount" -> YamlDocBuilder.YamlString(frame.optionCount.toString), - "nameCount" -> YamlDocBuilder.YamlString(frame.nameCount.toString), - "namespaceCount" -> YamlDocBuilder.YamlString(frame.namespaceCount.toString), - "tripleCount" -> YamlDocBuilder.YamlString(frame.tripleCount.toString), - "quadCount" -> YamlDocBuilder.YamlString(frame.quadCount.toString), - "prefixCount" -> YamlDocBuilder.YamlString(frame.prefixCount.toString), - "datatypeCount" -> YamlDocBuilder.YamlString(frame.datatypeCount.toString), - "graphStartCount" -> YamlDocBuilder.YamlString(frame.graphStartCount.toString), - "graphEndCount" -> YamlDocBuilder.YamlString(frame.graphEndCount.toString), + // I don't really like this approach, better create the printer with the options + val options = formatOptions(options = printOptions.getOrElse(RdfStreamOptions())) + val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => + formatStats(frame) + }.toSeq) + val fullString = + YamlDocBuilder.build( + YamlMap( + "stream_options" -> options, + "frames" -> yamlFrames, + ), ) - }.toSeq)) - o.write(parsedString.getBytes) + o.write(fullString.getBytes) } def printAggregate(o: OutputStream): Unit = { - val sumCounts = frameInfo.reduce((a, b) => { - a.optionCount += b.optionCount - a.nameCount += b.nameCount - a.namespaceCount += b.namespaceCount - a.tripleCount += b.tripleCount - a.quadCount += b.quadCount - a.prefixCount += b.prefixCount - a.datatypeCount += b.datatypeCount - a.graphStartCount += b.graphStartCount - a.graphEndCount += b.graphEndCount - a - }) - val parsedString = YamlDocBuilder.build( - YamlDocBuilder.YamlMap( - "optionCount" -> YamlDocBuilder.YamlString(sumCounts.optionCount.toString), - "nameCount" -> YamlDocBuilder.YamlString(sumCounts.nameCount.toString), - "nameSpaceCount" -> YamlDocBuilder.YamlString(sumCounts.namespaceCount.toString), - "tripleCount" -> YamlDocBuilder.YamlString(sumCounts.tripleCount.toString), - "quadCount" -> YamlDocBuilder.YamlString(sumCounts.quadCount.toString), - "prefixCount" -> YamlDocBuilder.YamlString(sumCounts.prefixCount.toString), - "datatypeCount" -> YamlDocBuilder.YamlString(sumCounts.datatypeCount.toString), - "graphStartCount" -> YamlDocBuilder.YamlString(sumCounts.graphStartCount.toString), - "graphEndCount" -> YamlDocBuilder.YamlString(sumCounts.graphEndCount.toString), - ), - ) - o.write(parsedString.getBytes) + val sumCounts = frameInfo.reduce(_ += _) + val options = formatOptions(options = printOptions.getOrElse(RdfStreamOptions())) + val fullString = + YamlDocBuilder.build( + YamlMap( + "stream_options" -> options, + "frames" -> formatStats(sumCounts), + ), + ) + o.write(fullString.getBytes) } end MetricsPrinter object MetricsPrinter: + /** This method converts a boolean to an integer + * @param b + * @return + */ + private def boolToInt(b: Boolean): Int = + if b then 1 else 0 + + def formatOptions( + options: RdfStreamOptions, + ): YamlDocBuilder.YamlMap = + YamlDocBuilder.YamlMap( + "rdf_star" -> YamlDocBuilder.YamlInt(boolToInt(options.rdfStar)), + "stream_name" -> YamlDocBuilder.YamlString(options.streamName), + "generalized_statements" -> YamlDocBuilder.YamlInt(boolToInt(options.generalizedStatements)), + "version" -> YamlDocBuilder.YamlInt(options.version), + "max_datatype_table_size" -> YamlDocBuilder.YamlInt(options.maxDatatypeTableSize), + "max_name_table_size" -> YamlDocBuilder.YamlInt(options.maxNameTableSize), + "max_prefix_table_size" -> YamlDocBuilder.YamlInt(options.maxPrefixTableSize), + "logical_type" -> YamlDocBuilder.YamlInt(options.logicalType.value), + "physical_type" -> YamlDocBuilder.YamlInt(options.physicalType.value), + ) + + def formatStats( + frame: FrameInfo, + ): YamlDocBuilder.YamlMap = + YamlDocBuilder.YamlMap( + "option_count" -> YamlDocBuilder.YamlInt(frame.optionCount), + "name_count" -> YamlDocBuilder.YamlInt(frame.nameCount), + "namespace_count" -> YamlDocBuilder.YamlInt(frame.namespaceCount), + "triple_count" -> YamlDocBuilder.YamlInt(frame.tripleCount), + "quad_count" -> YamlDocBuilder.YamlInt(frame.quadCount), + "prefix_count" -> YamlDocBuilder.YamlInt(frame.prefixCount), + "datatype_count" -> YamlDocBuilder.YamlInt(frame.datatypeCount), + "graph_start_count" -> YamlDocBuilder.YamlInt(frame.graphStartCount), + "graph_end_count" -> YamlDocBuilder.YamlInt(frame.graphEndCount), + ) + end MetricsPrinter object JellyUtil: diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala index bd5aa08..a6d66b3 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -6,13 +6,16 @@ object YamlDocBuilder: */ sealed trait YamlValue - case class YamlString(v: String) extends YamlValue + sealed trait YamlScalar extends YamlValue + case class YamlInt(v: Int) extends YamlScalar + case class YamlString(v: String) extends YamlScalar case class YamlList(v: Seq[YamlValue]) extends YamlValue object YamlMap: def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.toMap) def apply(k: String, v: String): YamlMap = YamlMap(Map(k -> YamlString(v))) + def apply(k: String, v: Int): YamlMap = YamlMap(Map(k -> YamlInt(v))) def apply(k: String, v: YamlValue): YamlMap = YamlMap(Map(k -> v)) case class YamlMap(v: Map[String, YamlValue]) extends YamlValue @@ -26,9 +29,13 @@ object YamlDocBuilder: root match case YamlString(v) => sb.append(quoteAndEscape(v)) + case YamlInt(v) => + sb.append(v) case YamlList(v) => sb.append(System.lineSeparator()) - v.foreach { e => + v.zipWithIndex.foreach { (e, index) => + sb.append(f"# frame ${index}") + sb.append(System.lineSeparator()) sb.append(" " * indent).append("- ") build(e, sb, indent + 1) if e != v.last then sb.append(System.lineSeparator()) @@ -36,9 +43,14 @@ object YamlDocBuilder: case YamlMap(v) => v.zipWithIndex.foreach { case ((k, e), ix) => if ix != 0 then sb.append(" " * indent) - sb.append(quoteAndEscape(k)) + sb.append(k) sb.append(": ") - build(e, sb, indent + 1) + if e.isInstanceOf[YamlMap] then + // If a map nested inside a map we have to indent it properly + sb.append(System.lineSeparator()) + sb.append(" " * (indent + 1)) + build(e, sb, indent + 1) + else build(e, sb, indent + 1) if ix != v.size - 1 then sb.append(System.lineSeparator()) } diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala index 4065204..cd49179 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala @@ -22,7 +22,6 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: } "be able to return all frames separately as a valid Yaml" in withFullJellyFile( testCode = { j => - // I'll probably have to make a Jelly file with multiple frames to test it well val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", "--per-frame", j)) val yaml = new Yaml() try { From da9dcfcf9e70b2d2d4778e1e7f19e10e18eed47f Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Fri, 4 Apr 2025 18:49:46 +0200 Subject: [PATCH 3/6] Add final polish --- .../eu/neverblink/jelly/cli/Exceptions.scala | 3 +- .../jelly/cli/command/rdf/RdfInspect.scala | 43 +++++-- .../neverblink/jelly/cli/util/JellyUtil.scala | 111 +----------------- .../jelly/cli/util/MetricsPrinter.scala | 110 +++++++++++++++++ .../jelly/cli/util/YamlDocBuilder.scala | 1 + .../cli/command/rdf/RdfInspectSpec.scala | 54 ++++++--- 6 files changed, 186 insertions(+), 136 deletions(-) create mode 100644 src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala diff --git a/src/main/scala/eu/neverblink/jelly/cli/Exceptions.scala b/src/main/scala/eu/neverblink/jelly/cli/Exceptions.scala index c741178..b7f6dd2 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/Exceptions.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/Exceptions.scala @@ -1,6 +1,5 @@ package eu.neverblink.jelly.cli -import com.google.protobuf.InvalidProtocolBufferException import org.apache.jena.riot.RiotException /** Contains a set of common jelly-cli exceptions with custom output messages. @@ -22,7 +21,7 @@ case class JellyTranscodingError(message: String) extends CriticalException(s"Jelly transcoding error: $message") case class JenaRiotException(e: RiotException) extends CriticalException(s"Jena RDF I/O exception: ${e.getMessage}") -case class InvalidJellyFile(e: InvalidProtocolBufferException) +case class InvalidJellyFile(e: Exception) extends CriticalException(s"Invalid Jelly file: ${e.getMessage}") case class InvalidFormatSpecified(format: String, validFormats: String) extends CriticalException( diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala index fd642df..55131e0 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -3,7 +3,7 @@ package eu.neverblink.jelly.cli.command.rdf import caseapp.{ExtraName, Recurse} import caseapp.core.RemainingArgs import eu.neverblink.jelly.cli.util.{FrameInfo, JellyUtil, MetricsPrinter} -import eu.neverblink.jelly.cli.{HasJellyCommandOptions, JellyCommand, JellyCommandOptions} +import eu.neverblink.jelly.cli.* import eu.ostrzyciel.jelly.core.proto.v1.* import java.io.InputStream @@ -33,26 +33,30 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: private def inspectJelly( inputStream: InputStream, ): MetricsPrinter = - val printer = new MetricsPrinter - // Here we can easily compute overall metrics - inline def computeMetrics(frame: RdfStreamFrame, frameIndex: Int): Unit = - if printer.printOptions.isEmpty then - if frame.rows.nonEmpty && frame.rows.head.row.isOptions then - printer.printOptions = Some(frame.rows.head.row.asInstanceOf[RdfStreamOptions]) - else throw new RuntimeException("First row of the frame is not an options row") + inline def computeMetrics( + frame: RdfStreamFrame, + frameIndex: Int, + printer: MetricsPrinter, + ): Unit = val metrics = new FrameInfo() frame.rows.foreach(r => metricsForRow(r, metrics)) printer.frameInfo += metrics try { - JellyUtil.iterateRdfStream(inputStream).zipWithIndex.foreach { - case (maybeFrame, frameIndex) => computeMetrics(maybeFrame, frameIndex) + val allRows = JellyUtil.iterateRdfStream(inputStream).toList + // we need to check if the first frame contains options + val streamOptions = checkOptions(allRows) + val printer = new MetricsPrinter(streamOptions) + // We compute the metrics for each frame + // and then sum them all during the printing if desired + allRows.zipWithIndex.foreach { case (maybeFrame, frameIndex) => + computeMetrics(maybeFrame, frameIndex, printer) } printer } catch { case e: Exception => - throw new RuntimeException("Error inspecting Jelly file", e) + throw InvalidJellyFile(e) } private def metricsForRow( @@ -70,3 +74,20 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: case r: RdfGraphEnd => metadata.graphEndCount += 1 case r: RdfStreamOptions => metadata.optionCount += 1 } + + /** Checks whether the first frame in the stream contains options and returns them. + * @param allFrames + * The list of all frames in the stream. + * @return + * The options from the first frame. + * @throws RuntimeException + * If the first frame does not contain options or if there are no frames in the stream. + */ + private def checkOptions(allFrames: List[RdfStreamFrame]): RdfStreamOptions = + if allFrames.isEmpty then throw new RuntimeException("No frames in the stream.") + if allFrames.head.rows.isEmpty then throw new RuntimeException("No rows in the frame.") + val frameRows = allFrames.head.rows + frameRows.head.row match { + case r: RdfStreamOptions => r + case _ => throw new RuntimeException("First row of the frame is not an options row.") + } diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala index 2abf216..5924b84 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/JellyUtil.scala @@ -1,116 +1,9 @@ package eu.neverblink.jelly.cli.util -import eu.neverblink.jelly.cli.util.MetricsPrinter.{formatOptions, formatStats} -import eu.neverblink.jelly.cli.util.YamlDocBuilder.YamlMap import eu.ostrzyciel.jelly.core.IoUtils -import eu.ostrzyciel.jelly.core.proto.v1.{RdfStreamFrame, RdfStreamOptions} +import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame -import java.io.{InputStream, OutputStream} -import scala.collection.mutable.ListBuffer - -class FrameInfo: - var optionCount: Int = 0 - var nameCount: Int = 0 - var namespaceCount: Int = 0 - var tripleCount: Int = 0 - var quadCount: Int = 0 - var prefixCount: Int = 0 - var datatypeCount: Int = 0 - var graphStartCount: Int = 0 - var graphEndCount: Int = 0 - - def +=(other: FrameInfo): FrameInfo = { - this.optionCount += other.optionCount - this.nameCount += other.nameCount - this.namespaceCount += other.namespaceCount - this.tripleCount += other.tripleCount - this.quadCount += other.quadCount - this.prefixCount += other.prefixCount - this.datatypeCount += other.datatypeCount - this.graphStartCount += other.graphStartCount - this.graphEndCount += other.graphEndCount - this - } - -end FrameInfo - -class MetricsPrinter: - - var printOptions: Option[RdfStreamOptions] = None - var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty - - def printPerFrame(o: OutputStream): Unit = { - // I don't really like this approach, better create the printer with the options - val options = formatOptions(options = printOptions.getOrElse(RdfStreamOptions())) - val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => - formatStats(frame) - }.toSeq) - val fullString = - YamlDocBuilder.build( - YamlMap( - "stream_options" -> options, - "frames" -> yamlFrames, - ), - ) - o.write(fullString.getBytes) - - } - - def printAggregate(o: OutputStream): Unit = { - val sumCounts = frameInfo.reduce(_ += _) - val options = formatOptions(options = printOptions.getOrElse(RdfStreamOptions())) - val fullString = - YamlDocBuilder.build( - YamlMap( - "stream_options" -> options, - "frames" -> formatStats(sumCounts), - ), - ) - o.write(fullString.getBytes) - } - -end MetricsPrinter - -object MetricsPrinter: - - /** This method converts a boolean to an integer - * @param b - * @return - */ - private def boolToInt(b: Boolean): Int = - if b then 1 else 0 - - def formatOptions( - options: RdfStreamOptions, - ): YamlDocBuilder.YamlMap = - YamlDocBuilder.YamlMap( - "rdf_star" -> YamlDocBuilder.YamlInt(boolToInt(options.rdfStar)), - "stream_name" -> YamlDocBuilder.YamlString(options.streamName), - "generalized_statements" -> YamlDocBuilder.YamlInt(boolToInt(options.generalizedStatements)), - "version" -> YamlDocBuilder.YamlInt(options.version), - "max_datatype_table_size" -> YamlDocBuilder.YamlInt(options.maxDatatypeTableSize), - "max_name_table_size" -> YamlDocBuilder.YamlInt(options.maxNameTableSize), - "max_prefix_table_size" -> YamlDocBuilder.YamlInt(options.maxPrefixTableSize), - "logical_type" -> YamlDocBuilder.YamlInt(options.logicalType.value), - "physical_type" -> YamlDocBuilder.YamlInt(options.physicalType.value), - ) - - def formatStats( - frame: FrameInfo, - ): YamlDocBuilder.YamlMap = - YamlDocBuilder.YamlMap( - "option_count" -> YamlDocBuilder.YamlInt(frame.optionCount), - "name_count" -> YamlDocBuilder.YamlInt(frame.nameCount), - "namespace_count" -> YamlDocBuilder.YamlInt(frame.namespaceCount), - "triple_count" -> YamlDocBuilder.YamlInt(frame.tripleCount), - "quad_count" -> YamlDocBuilder.YamlInt(frame.quadCount), - "prefix_count" -> YamlDocBuilder.YamlInt(frame.prefixCount), - "datatype_count" -> YamlDocBuilder.YamlInt(frame.datatypeCount), - "graph_start_count" -> YamlDocBuilder.YamlInt(frame.graphStartCount), - "graph_end_count" -> YamlDocBuilder.YamlInt(frame.graphEndCount), - ) - -end MetricsPrinter +import java.io.InputStream object JellyUtil: /** This method reads the Jelly file and returns an iterator of RdfStreamFrame diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala new file mode 100644 index 0000000..e917b45 --- /dev/null +++ b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala @@ -0,0 +1,110 @@ +package eu.neverblink.jelly.cli.util + +import eu.neverblink.jelly.cli.util.MetricsPrinter.{formatOptions, formatStats} +import eu.neverblink.jelly.cli.util.YamlDocBuilder.YamlMap +import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions + +import java.io.OutputStream +import scala.collection.mutable.ListBuffer + +/** This class is used to store the metrics for a single frame + */ +class FrameInfo: + var optionCount: Int = 0 + var nameCount: Int = 0 + var namespaceCount: Int = 0 + var tripleCount: Int = 0 + var quadCount: Int = 0 + var prefixCount: Int = 0 + var datatypeCount: Int = 0 + var graphStartCount: Int = 0 + var graphEndCount: Int = 0 + + def +=(other: FrameInfo): FrameInfo = { + this.optionCount += other.optionCount + this.nameCount += other.nameCount + this.namespaceCount += other.namespaceCount + this.tripleCount += other.tripleCount + this.quadCount += other.quadCount + this.prefixCount += other.prefixCount + this.datatypeCount += other.datatypeCount + this.graphStartCount += other.graphStartCount + this.graphEndCount += other.graphEndCount + this + } + +end FrameInfo + +class MetricsPrinter(printOptions: RdfStreamOptions): + + var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty + + def printPerFrame(o: OutputStream): Unit = { + val options = formatOptions(options = printOptions) + val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => + formatStats(frame) + }.toSeq) + val fullString = + YamlDocBuilder.build( + YamlMap( + "stream_options" -> options, + "frames" -> yamlFrames, + ), + ) + o.write(fullString.getBytes) + + } + + def printAggregate(o: OutputStream): Unit = { + val sumCounts = frameInfo.reduce(_ += _) + val options = formatOptions(options = printOptions) + val fullString = + YamlDocBuilder.build( + YamlMap( + "stream_options" -> options, + "frames" -> formatStats(sumCounts), + ), + ) + o.write(fullString.getBytes) + } + +end MetricsPrinter + +object MetricsPrinter: + + /** This method converts a boolean to an integer + */ + private def boolToInt(b: Boolean): Int = + if b then 1 else 0 + + def formatOptions( + options: RdfStreamOptions, + ): YamlDocBuilder.YamlMap = + YamlDocBuilder.YamlMap( + "rdf_star" -> YamlDocBuilder.YamlInt(boolToInt(options.rdfStar)), + "stream_name" -> YamlDocBuilder.YamlString(options.streamName), + "generalized_statements" -> YamlDocBuilder.YamlInt(boolToInt(options.generalizedStatements)), + "version" -> YamlDocBuilder.YamlInt(options.version), + "max_datatype_table_size" -> YamlDocBuilder.YamlInt(options.maxDatatypeTableSize), + "max_name_table_size" -> YamlDocBuilder.YamlInt(options.maxNameTableSize), + "max_prefix_table_size" -> YamlDocBuilder.YamlInt(options.maxPrefixTableSize), + "logical_type" -> YamlDocBuilder.YamlInt(options.logicalType.value), + "physical_type" -> YamlDocBuilder.YamlInt(options.physicalType.value), + ) + + def formatStats( + frame: FrameInfo, + ): YamlDocBuilder.YamlMap = + YamlDocBuilder.YamlMap( + "option_count" -> YamlDocBuilder.YamlInt(frame.optionCount), + "name_count" -> YamlDocBuilder.YamlInt(frame.nameCount), + "namespace_count" -> YamlDocBuilder.YamlInt(frame.namespaceCount), + "triple_count" -> YamlDocBuilder.YamlInt(frame.tripleCount), + "quad_count" -> YamlDocBuilder.YamlInt(frame.quadCount), + "prefix_count" -> YamlDocBuilder.YamlInt(frame.prefixCount), + "datatype_count" -> YamlDocBuilder.YamlInt(frame.datatypeCount), + "graph_start_count" -> YamlDocBuilder.YamlInt(frame.graphStartCount), + "graph_end_count" -> YamlDocBuilder.YamlInt(frame.graphEndCount), + ) + +end MetricsPrinter diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala index a6d66b3..3720f5c 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -34,6 +34,7 @@ object YamlDocBuilder: case YamlList(v) => sb.append(System.lineSeparator()) v.zipWithIndex.foreach { (e, index) => + // We want to add a comment about which frame we're summing up sb.append(f"# frame ${index}") sb.append(System.lineSeparator()) sb.append(" " * indent).append("- ") diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala index cd49179..74f5378 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala @@ -1,10 +1,15 @@ package eu.neverblink.jelly.cli.command.rdf +import eu.neverblink.jelly.cli.{ExitException, InvalidJellyFile} import eu.neverblink.jelly.cli.command.helpers.TestFixtureHelper + +import scala.jdk.CollectionConverters.* import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec import org.yaml.snakeyaml.Yaml +import java.util + class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: protected val testCardinality: Int = 33 @@ -12,26 +17,47 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: "be able to return aggregate of all frames as a valid Yaml" in withFullJellyFile { j => val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", j)) val yaml = new Yaml() - try { - yaml.load(out) - true - } catch { - case e: Exception => - fail("Failed to parse YAML output", e) - } + val parsed = yaml.load(out).asInstanceOf[java.util.Map[String, Any]] + parsed.get("stream_options") should not be None + val options = parsed.get("stream_options").asInstanceOf[java.util.Map[String, Any]] + options.get("max_name_table_size") should be(128) + parsed.get("frames") shouldBe a[util.LinkedHashMap[?, ?]] + val frames = parsed.get("frames").asInstanceOf[java.util.LinkedHashMap[String, Any]] + frames.get("triple_count") should be(testCardinality) + } "be able to return all frames separately as a valid Yaml" in withFullJellyFile( testCode = { j => val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", "--per-frame", j)) val yaml = new Yaml() - try { - yaml.load(out) - true - } catch { - case e: Exception => - fail("Failed to parse YAML output", e) - } + val parsed = yaml.load(out).asInstanceOf[java.util.Map[String, Any]] + parsed.get("stream_options") should not be None + parsed.get("frames") shouldBe a[util.ArrayList[Map[String, Int]]] + val frames = + parsed.get("frames").asInstanceOf[util.ArrayList[util.HashMap[String, Int]]].asScala + frames.length should be <= 5 + frames.map(_.get("triple_count")).sum should be(testCardinality) + }, frameSize = 15, ) + "handle properly separate frame metrics for a singular frame" in withFullJellyFile { j => + val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", "--per-frame", j)) + val yaml = new Yaml() + val parsed = yaml.load(out).asInstanceOf[java.util.Map[String, Any]] + parsed.get("stream_options") should not be None + parsed.get("frames") shouldBe a[util.ArrayList[?]] + val frames = + parsed.get("frames").asInstanceOf[util.ArrayList[util.HashMap[String, Int]]].asScala + frames.length should be(1) + frames.map(_.get("triple_count")).sum should be(testCardinality) + } + "throw an error if the input file is not a valid jelly file" in withEmptyJellyFile { j => + val exception = intercept[ExitException] { + RdfInspect.runTestCommand(List("rdf", "inspect", j, "--debug")) + } + val err = RdfInspect.getErrString + val msg = InvalidJellyFile(RuntimeException("")).getMessage + err should include(msg) + } } From 47300bcdd7b0e0b9e3a2eeeb45905ded0d362d9f Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Fri, 4 Apr 2025 22:45:08 +0200 Subject: [PATCH 4/6] Correct comments --- .../jelly/cli/command/rdf/RdfInspect.scala | 2 +- .../jelly/cli/util/MetricsPrinter.scala | 77 ++++++++++--------- .../jelly/cli/util/YamlDocBuilder.scala | 25 +++--- .../cli/command/rdf/RdfInspectSpec.scala | 5 +- 4 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala index 55131e0..e5d09ee 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -39,7 +39,7 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: frameIndex: Int, printer: MetricsPrinter, ): Unit = - val metrics = new FrameInfo() + val metrics = new FrameInfo(frameIndex) frame.rows.foreach(r => metricsForRow(r, metrics)) printer.frameInfo += metrics diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala index e917b45..cc4f213 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala @@ -1,7 +1,6 @@ package eu.neverblink.jelly.cli.util -import eu.neverblink.jelly.cli.util.MetricsPrinter.{formatOptions, formatStats} -import eu.neverblink.jelly.cli.util.YamlDocBuilder.YamlMap +import eu.neverblink.jelly.cli.util.YamlDocBuilder.* import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions import java.io.OutputStream @@ -9,7 +8,7 @@ import scala.collection.mutable.ListBuffer /** This class is used to store the metrics for a single frame */ -class FrameInfo: +final class FrameInfo(val frameIndex: Int): var optionCount: Int = 0 var nameCount: Int = 0 var namespaceCount: Int = 0 @@ -35,14 +34,15 @@ class FrameInfo: end FrameInfo -class MetricsPrinter(printOptions: RdfStreamOptions): +final class MetricsPrinter(printOptions: RdfStreamOptions): + import eu.neverblink.jelly.cli.util.MetricsPrinter.* var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty def printPerFrame(o: OutputStream): Unit = { val options = formatOptions(options = printOptions) val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => - formatStats(frame) + formatStatsIndex(frame) }.toSeq) val fullString = YamlDocBuilder.build( @@ -56,13 +56,14 @@ class MetricsPrinter(printOptions: RdfStreamOptions): } def printAggregate(o: OutputStream): Unit = { + val frameCount = frameInfo.length val sumCounts = frameInfo.reduce(_ += _) val options = formatOptions(options = printOptions) val fullString = YamlDocBuilder.build( YamlMap( "stream_options" -> options, - "frames" -> formatStats(sumCounts), + "frames" -> formatStatsCount(sumCounts, frameCount), ), ) o.write(fullString.getBytes) @@ -72,39 +73,45 @@ end MetricsPrinter object MetricsPrinter: - /** This method converts a boolean to an integer - */ - private def boolToInt(b: Boolean): Int = - if b then 1 else 0 - - def formatOptions( + private def formatOptions( options: RdfStreamOptions, - ): YamlDocBuilder.YamlMap = - YamlDocBuilder.YamlMap( - "rdf_star" -> YamlDocBuilder.YamlInt(boolToInt(options.rdfStar)), - "stream_name" -> YamlDocBuilder.YamlString(options.streamName), - "generalized_statements" -> YamlDocBuilder.YamlInt(boolToInt(options.generalizedStatements)), - "version" -> YamlDocBuilder.YamlInt(options.version), - "max_datatype_table_size" -> YamlDocBuilder.YamlInt(options.maxDatatypeTableSize), - "max_name_table_size" -> YamlDocBuilder.YamlInt(options.maxNameTableSize), - "max_prefix_table_size" -> YamlDocBuilder.YamlInt(options.maxPrefixTableSize), - "logical_type" -> YamlDocBuilder.YamlInt(options.logicalType.value), - "physical_type" -> YamlDocBuilder.YamlInt(options.physicalType.value), + ): YamlMap = + YamlMap( + "stream_name" -> YamlString(options.streamName), + "physical_type" -> YamlEnum(options.physicalType.toString, options.physicalType.value), + "generalized_statements" -> YamlBool(options.generalizedStatements), + "rdf_star" -> YamlBool(options.rdfStar), + "max_name_table_size" -> YamlInt(options.maxNameTableSize), + "max_prefix_table_size" -> YamlInt(options.maxPrefixTableSize), + "max_datatype_table_size" -> YamlInt(options.maxDatatypeTableSize), + "logical_type" -> YamlEnum(options.logicalType.toString, options.logicalType.value), + "version" -> YamlInt(options.version), ) - def formatStats( + private def formatStatsIndex( + frame: FrameInfo, + ): YamlMap = + YamlMap(Seq(("frame_index", YamlInt(frame.frameIndex))) ++ formatStats(frame)*) + + private def formatStatsCount( + frame: FrameInfo, + frameCount: Int, + ): YamlMap = + YamlMap(Seq(("frame_count", YamlInt(frameCount))) ++ formatStats(frame)*) + + private def formatStats( frame: FrameInfo, - ): YamlDocBuilder.YamlMap = - YamlDocBuilder.YamlMap( - "option_count" -> YamlDocBuilder.YamlInt(frame.optionCount), - "name_count" -> YamlDocBuilder.YamlInt(frame.nameCount), - "namespace_count" -> YamlDocBuilder.YamlInt(frame.namespaceCount), - "triple_count" -> YamlDocBuilder.YamlInt(frame.tripleCount), - "quad_count" -> YamlDocBuilder.YamlInt(frame.quadCount), - "prefix_count" -> YamlDocBuilder.YamlInt(frame.prefixCount), - "datatype_count" -> YamlDocBuilder.YamlInt(frame.datatypeCount), - "graph_start_count" -> YamlDocBuilder.YamlInt(frame.graphStartCount), - "graph_end_count" -> YamlDocBuilder.YamlInt(frame.graphEndCount), + ): Seq[(String, YamlValue)] = + Seq( + ("option_count", YamlInt(frame.optionCount)), + ("triple_count", YamlInt(frame.tripleCount)), + ("quad_count", YamlInt(frame.quadCount)), + ("graph_start_count", YamlInt(frame.graphStartCount)), + ("graph_end_count", YamlInt(frame.graphEndCount)), + ("namespace_count", YamlInt(frame.namespaceCount)), + ("name_count", YamlInt(frame.nameCount)), + ("prefix_count", YamlInt(frame.prefixCount)), + ("datatype_count", YamlInt(frame.datatypeCount)), ) end MetricsPrinter diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala index 3720f5c..61c008e 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -1,24 +1,28 @@ package eu.neverblink.jelly.cli.util +import scala.collection.mutable + object YamlDocBuilder: /** A lightweight YAML document builder based on - * https://github.com/RiverBench/ci-worker/blob/main/src%2Fmain%2Fscala%2Futil%2FYamlDocBuilder.scala + * https://github.com/RiverBench/ci-worker/blob/2d57a085f65a6eabbfe76f2de6794f025b211f4e/src/main/scala/util/YamlDocBuilder.scala#L4 */ sealed trait YamlValue sealed trait YamlScalar extends YamlValue + case class YamlEnum(v: String, i: Int) extends YamlScalar case class YamlInt(v: Int) extends YamlScalar + case class YamlBool(v: Boolean) extends YamlScalar case class YamlString(v: String) extends YamlScalar case class YamlList(v: Seq[YamlValue]) extends YamlValue object YamlMap: - def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.toMap) - def apply(k: String, v: String): YamlMap = YamlMap(Map(k -> YamlString(v))) - def apply(k: String, v: Int): YamlMap = YamlMap(Map(k -> YamlInt(v))) - def apply(k: String, v: YamlValue): YamlMap = YamlMap(Map(k -> v)) + def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.to(mutable.LinkedHashMap)) + def apply(k: String, v: String): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlString(v))) + def apply(k: String, v: Int): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlInt(v))) + def apply(k: String, v: YamlValue): YamlMap = YamlMap(mutable.LinkedHashMap(k -> v)) - case class YamlMap(v: Map[String, YamlValue]) extends YamlValue + case class YamlMap(v: mutable.LinkedHashMap[String, YamlValue]) extends YamlValue def build(root: YamlValue): String = val sb = new StringBuilder @@ -31,14 +35,16 @@ object YamlDocBuilder: sb.append(quoteAndEscape(v)) case YamlInt(v) => sb.append(v) + case YamlBool(v) => + sb.append(v.toString) + case YamlEnum(v, i) => + sb.append(f"${v} (${i})") case YamlList(v) => sb.append(System.lineSeparator()) v.zipWithIndex.foreach { (e, index) => - // We want to add a comment about which frame we're summing up - sb.append(f"# frame ${index}") - sb.append(System.lineSeparator()) sb.append(" " * indent).append("- ") build(e, sb, indent + 1) + sb.append(System.lineSeparator()) if e != v.last then sb.append(System.lineSeparator()) } case YamlMap(v) => @@ -51,6 +57,7 @@ object YamlDocBuilder: sb.append(System.lineSeparator()) sb.append(" " * (indent + 1)) build(e, sb, indent + 1) + sb.append(System.lineSeparator()) else build(e, sb, indent + 1) if ix != v.size - 1 then sb.append(System.lineSeparator()) } diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala index 74f5378..78dad38 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala @@ -24,7 +24,6 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: parsed.get("frames") shouldBe a[util.LinkedHashMap[?, ?]] val frames = parsed.get("frames").asInstanceOf[java.util.LinkedHashMap[String, Any]] frames.get("triple_count") should be(testCardinality) - } "be able to return all frames separately as a valid Yaml" in withFullJellyFile( testCode = { j => @@ -37,7 +36,6 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: parsed.get("frames").asInstanceOf[util.ArrayList[util.HashMap[String, Int]]].asScala frames.length should be <= 5 frames.map(_.get("triple_count")).sum should be(testCardinality) - }, frameSize = 15, ) @@ -56,8 +54,7 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: val exception = intercept[ExitException] { RdfInspect.runTestCommand(List("rdf", "inspect", j, "--debug")) } - val err = RdfInspect.getErrString val msg = InvalidJellyFile(RuntimeException("")).getMessage - err should include(msg) + exception.getMessage should include(msg) } } From 4b06355efb12276833dd6758cc19404bc2329bf9 Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Sat, 5 Apr 2025 16:56:49 +0200 Subject: [PATCH 5/6] Change computation to lazy --- .../jelly/cli/command/rdf/RdfInspect.scala | 36 +++++----- .../jelly/cli/util/MetricsPrinter.scala | 65 +++++++++++-------- .../jelly/cli/util/YamlDocBuilder.scala | 28 ++++---- .../cli/command/rdf/RdfInspectSpec.scala | 17 ++++- 4 files changed, 89 insertions(+), 57 deletions(-) diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala index e5d09ee..8065104 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspect.scala @@ -26,34 +26,32 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: override def doRun(options: RdfInspectOptions, remainingArgs: RemainingArgs): Unit = val (inputStream, outputStream) = this.getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile) - val printer = inspectJelly(inputStream) - if options.perFrame then printer.printPerFrame(outputStream) - else printer.printAggregate(outputStream) + val (streamOpts, frameIterator) = inspectJelly(inputStream) + if options.perFrame then MetricsPrinter.printPerFrame(streamOpts, frameIterator, outputStream) + else MetricsPrinter.printAggregate(streamOpts, frameIterator, outputStream) private def inspectJelly( inputStream: InputStream, - ): MetricsPrinter = + ): (RdfStreamOptions, Iterator[FrameInfo]) = inline def computeMetrics( frame: RdfStreamFrame, frameIndex: Int, - printer: MetricsPrinter, - ): Unit = + ): FrameInfo = val metrics = new FrameInfo(frameIndex) frame.rows.foreach(r => metricsForRow(r, metrics)) - printer.frameInfo += metrics + metrics try { - val allRows = JellyUtil.iterateRdfStream(inputStream).toList + val allRows = JellyUtil.iterateRdfStream(inputStream).buffered // we need to check if the first frame contains options - val streamOptions = checkOptions(allRows) - val printer = new MetricsPrinter(streamOptions) + val streamOptions = checkOptions(allRows.headOption) // We compute the metrics for each frame // and then sum them all during the printing if desired - allRows.zipWithIndex.foreach { case (maybeFrame, frameIndex) => - computeMetrics(maybeFrame, frameIndex, printer) + val frameIterator = allRows.zipWithIndex.map { case (maybeFrame, frameIndex) => + computeMetrics(maybeFrame, frameIndex) } - printer + (streamOptions, frameIterator) } catch { case e: Exception => throw InvalidJellyFile(e) @@ -76,17 +74,17 @@ object RdfInspect extends JellyCommand[RdfInspectOptions]: } /** Checks whether the first frame in the stream contains options and returns them. - * @param allFrames - * The list of all frames in the stream. + * @param headFrame + * The first frame in the stream as an option. * @return * The options from the first frame. * @throws RuntimeException * If the first frame does not contain options or if there are no frames in the stream. */ - private def checkOptions(allFrames: List[RdfStreamFrame]): RdfStreamOptions = - if allFrames.isEmpty then throw new RuntimeException("No frames in the stream.") - if allFrames.head.rows.isEmpty then throw new RuntimeException("No rows in the frame.") - val frameRows = allFrames.head.rows + private def checkOptions(headFrame: Option[RdfStreamFrame]): RdfStreamOptions = + if headFrame.isEmpty then throw new RuntimeException("No frames in the stream.") + if headFrame.get.rows.isEmpty then throw new RuntimeException("No rows in the frame.") + val frameRows = headFrame.get.rows frameRows.head.row match { case r: RdfStreamOptions => r case _ => throw new RuntimeException("First row of the frame is not an options row.") diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala index cc4f213..0997a23 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala @@ -4,11 +4,11 @@ import eu.neverblink.jelly.cli.util.YamlDocBuilder.* import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions import java.io.OutputStream -import scala.collection.mutable.ListBuffer /** This class is used to store the metrics for a single frame */ final class FrameInfo(val frameIndex: Int): + var frameCount: Int = 1 var optionCount: Int = 0 var nameCount: Int = 0 var namespaceCount: Int = 0 @@ -20,6 +20,7 @@ final class FrameInfo(val frameIndex: Int): var graphEndCount: Int = 0 def +=(other: FrameInfo): FrameInfo = { + this.frameCount += 1 this.optionCount += other.optionCount this.nameCount += other.nameCount this.namespaceCount += other.namespaceCount @@ -34,44 +35,57 @@ final class FrameInfo(val frameIndex: Int): end FrameInfo -final class MetricsPrinter(printOptions: RdfStreamOptions): - import eu.neverblink.jelly.cli.util.MetricsPrinter.* - - var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty +object MetricsPrinter: - def printPerFrame(o: OutputStream): Unit = { - val options = formatOptions(options = printOptions) - val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => - formatStatsIndex(frame) - }.toSeq) - val fullString = + def printPerFrame( + options: RdfStreamOptions, + iterator: Iterator[FrameInfo], + o: OutputStream, + ): Unit = + printOptions(options, o) + val (fullString, indent) = YamlDocBuilder.build( YamlMap( - "stream_options" -> options, - "frames" -> yamlFrames, + "frames" -> YamlBlank(), + ), + ) + o.write(fullString.getBytes) + iterator.foreach { frame => + val yamlFrame = YamlListElem(formatStatsIndex(frame)) + val (fullString, _) = YamlDocBuilder.build(yamlFrame, indent) + o.write(fullString.getBytes) + o.write(System.lineSeparator().getBytes) + } + + def printAggregate( + options: RdfStreamOptions, + iterator: Iterator[FrameInfo], + o: OutputStream, + ): Unit = { + printOptions(options, o) + val sumCounts = iterator.reduce((a, b) => a += b) + val (fullString, _) = + YamlDocBuilder.build( + YamlMap( + "frames" -> formatStatsCount(sumCounts), ), ) o.write(fullString.getBytes) - } - def printAggregate(o: OutputStream): Unit = { - val frameCount = frameInfo.length - val sumCounts = frameInfo.reduce(_ += _) + private def printOptions( + printOptions: RdfStreamOptions, + o: OutputStream, + ): Unit = val options = formatOptions(options = printOptions) - val fullString = + val (fullString, _) = YamlDocBuilder.build( YamlMap( "stream_options" -> options, - "frames" -> formatStatsCount(sumCounts, frameCount), ), ) o.write(fullString.getBytes) - } - -end MetricsPrinter - -object MetricsPrinter: + o.write(System.lineSeparator().getBytes) private def formatOptions( options: RdfStreamOptions, @@ -95,9 +109,8 @@ object MetricsPrinter: private def formatStatsCount( frame: FrameInfo, - frameCount: Int, ): YamlMap = - YamlMap(Seq(("frame_count", YamlInt(frameCount))) ++ formatStats(frame)*) + YamlMap(Seq(("frame_count", YamlInt(frame.frameCount))) ++ formatStats(frame)*) private def formatStats( frame: FrameInfo, diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala index 61c008e..10f3566 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -9,12 +9,14 @@ object YamlDocBuilder: sealed trait YamlValue sealed trait YamlScalar extends YamlValue + case class YamlBlank() extends YamlScalar case class YamlEnum(v: String, i: Int) extends YamlScalar case class YamlInt(v: Int) extends YamlScalar case class YamlBool(v: Boolean) extends YamlScalar case class YamlString(v: String) extends YamlScalar - case class YamlList(v: Seq[YamlValue]) extends YamlValue + case class YamlList(v: Seq[YamlListElem]) extends YamlValue + case class YamlListElem(v: YamlValue) extends YamlValue object YamlMap: def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.to(mutable.LinkedHashMap)) @@ -24,12 +26,13 @@ object YamlDocBuilder: case class YamlMap(v: mutable.LinkedHashMap[String, YamlValue]) extends YamlValue - def build(root: YamlValue): String = + def build(root: YamlValue, indent: Int = 0): (String, Int) = val sb = new StringBuilder - build(root, sb, 0) - sb.toString + val maxIndent = build(root, sb, indent) + (sb.toString, maxIndent) - private def build(root: YamlValue, sb: StringBuilder, indent: Int): Unit = + private def build(root: YamlValue, sb: StringBuilder, indent: Int): Int = + var maxIndent = indent root match case YamlString(v) => sb.append(quoteAndEscape(v)) @@ -40,13 +43,14 @@ object YamlDocBuilder: case YamlEnum(v, i) => sb.append(f"${v} (${i})") case YamlList(v) => - sb.append(System.lineSeparator()) v.zipWithIndex.foreach { (e, index) => - sb.append(" " * indent).append("- ") - build(e, sb, indent + 1) - sb.append(System.lineSeparator()) + maxIndent = build(e, sb, indent) if e != v.last then sb.append(System.lineSeparator()) } + case YamlListElem(v) => + sb.append(System.lineSeparator()) + sb.append(" " * indent).append("- ") + maxIndent = build(v, sb, indent + 1) case YamlMap(v) => v.zipWithIndex.foreach { case ((k, e), ix) => if ix != 0 then sb.append(" " * indent) @@ -56,11 +60,13 @@ object YamlDocBuilder: // If a map nested inside a map we have to indent it properly sb.append(System.lineSeparator()) sb.append(" " * (indent + 1)) - build(e, sb, indent + 1) + maxIndent = build(e, sb, indent + 1) sb.append(System.lineSeparator()) - else build(e, sb, indent + 1) + else maxIndent = build(e, sb, indent + 1) if ix != v.size - 1 then sb.append(System.lineSeparator()) } + case YamlBlank() => () + maxIndent private def quoteAndEscape(s: String): String = "\"" + escape(s) + "\"" diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala index 78dad38..f2ab433 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfInspectSpec.scala @@ -50,7 +50,22 @@ class RdfInspectSpec extends AnyWordSpec with Matchers with TestFixtureHelper: frames.length should be(1) frames.map(_.get("triple_count")).sum should be(testCardinality) } - "throw an error if the input file is not a valid jelly file" in withEmptyJellyFile { j => + "handle properly frame count when aggregating multiple frames" in withFullJellyFile( + testCode = { j => + val (out, err) = RdfInspect.runTestCommand(List("rdf", "inspect", j)) + val yaml = new Yaml() + val parsed = yaml.load(out).asInstanceOf[java.util.Map[String, Any]] + parsed.get("stream_options") should not be None + val options = parsed.get("stream_options").asInstanceOf[java.util.Map[String, Any]] + options.get("max_name_table_size") should be(128) + parsed.get("frames") shouldBe a[util.LinkedHashMap[?, ?]] + val frames = parsed.get("frames").asInstanceOf[java.util.LinkedHashMap[String, Any]] + frames.get("triple_count") should be(testCardinality) + frames.get("frame_count") should be(5) + }, + frameSize = 15, + ) + "throw an error if the input file is not a valid Jelly file" in withEmptyJellyFile { j => val exception = intercept[ExitException] { RdfInspect.runTestCommand(List("rdf", "inspect", j, "--debug")) } From 0102f6d59cf2f6f2c6f56246ba637efe4647cfac Mon Sep 17 00:00:00 2001 From: Karolina Bogacka Date: Sat, 5 Apr 2025 21:46:13 +0200 Subject: [PATCH 6/6] Make the indent handling a little bit nicer --- .../jelly/cli/util/MetricsPrinter.scala | 13 ++-- .../jelly/cli/util/YamlDocBuilder.scala | 73 ++++++++++--------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala index 0997a23..82f0d9d 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/MetricsPrinter.scala @@ -43,16 +43,17 @@ object MetricsPrinter: o: OutputStream, ): Unit = printOptions(options, o) - val (fullString, indent) = + val builder = YamlDocBuilder.build( YamlMap( "frames" -> YamlBlank(), ), ) + val fullString = builder.getString o.write(fullString.getBytes) iterator.foreach { frame => val yamlFrame = YamlListElem(formatStatsIndex(frame)) - val (fullString, _) = YamlDocBuilder.build(yamlFrame, indent) + val fullString = YamlDocBuilder.build(yamlFrame, builder.currIndent).getString o.write(fullString.getBytes) o.write(System.lineSeparator().getBytes) } @@ -64,12 +65,12 @@ object MetricsPrinter: ): Unit = { printOptions(options, o) val sumCounts = iterator.reduce((a, b) => a += b) - val (fullString, _) = + val fullString = YamlDocBuilder.build( YamlMap( "frames" -> formatStatsCount(sumCounts), ), - ) + ).getString o.write(fullString.getBytes) } @@ -78,12 +79,12 @@ object MetricsPrinter: o: OutputStream, ): Unit = val options = formatOptions(options = printOptions) - val (fullString, _) = + val fullString = YamlDocBuilder.build( YamlMap( "stream_options" -> options, ), - ) + ).getString o.write(fullString.getBytes) o.write(System.lineSeparator().getBytes) diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala index 10f3566..0f7f657 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/YamlDocBuilder.scala @@ -2,37 +2,14 @@ package eu.neverblink.jelly.cli.util import scala.collection.mutable -object YamlDocBuilder: - /** A lightweight YAML document builder based on - * https://github.com/RiverBench/ci-worker/blob/2d57a085f65a6eabbfe76f2de6794f025b211f4e/src/main/scala/util/YamlDocBuilder.scala#L4 - */ - - sealed trait YamlValue - sealed trait YamlScalar extends YamlValue - case class YamlBlank() extends YamlScalar - case class YamlEnum(v: String, i: Int) extends YamlScalar - case class YamlInt(v: Int) extends YamlScalar - case class YamlBool(v: Boolean) extends YamlScalar - case class YamlString(v: String) extends YamlScalar +class YamlDocBuilder(var currIndent: Int = 0): + import YamlDocBuilder.* + private val sb = new StringBuilder - case class YamlList(v: Seq[YamlListElem]) extends YamlValue - case class YamlListElem(v: YamlValue) extends YamlValue - - object YamlMap: - def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.to(mutable.LinkedHashMap)) - def apply(k: String, v: String): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlString(v))) - def apply(k: String, v: Int): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlInt(v))) - def apply(k: String, v: YamlValue): YamlMap = YamlMap(mutable.LinkedHashMap(k -> v)) - - case class YamlMap(v: mutable.LinkedHashMap[String, YamlValue]) extends YamlValue - - def build(root: YamlValue, indent: Int = 0): (String, Int) = - val sb = new StringBuilder - val maxIndent = build(root, sb, indent) - (sb.toString, maxIndent) + def getString: String = sb.toString - private def build(root: YamlValue, sb: StringBuilder, indent: Int): Int = - var maxIndent = indent + private def build(root: YamlValue, indent: Int = currIndent): Unit = + if indent > currIndent then currIndent = indent root match case YamlString(v) => sb.append(quoteAndEscape(v)) @@ -44,13 +21,13 @@ object YamlDocBuilder: sb.append(f"${v} (${i})") case YamlList(v) => v.zipWithIndex.foreach { (e, index) => - maxIndent = build(e, sb, indent) + this.build(e, indent) if e != v.last then sb.append(System.lineSeparator()) } case YamlListElem(v) => sb.append(System.lineSeparator()) sb.append(" " * indent).append("- ") - maxIndent = build(v, sb, indent + 1) + this.build(v, indent + 1) case YamlMap(v) => v.zipWithIndex.foreach { case ((k, e), ix) => if ix != 0 then sb.append(" " * indent) @@ -60,13 +37,41 @@ object YamlDocBuilder: // If a map nested inside a map we have to indent it properly sb.append(System.lineSeparator()) sb.append(" " * (indent + 1)) - maxIndent = build(e, sb, indent + 1) + this.build(e, indent + 1) sb.append(System.lineSeparator()) - else maxIndent = build(e, sb, indent + 1) + else this.build(e, indent + 1) if ix != v.size - 1 then sb.append(System.lineSeparator()) } case YamlBlank() => () - maxIndent + +object YamlDocBuilder: + /** A lightweight YAML document builder based on + * https://github.com/RiverBench/ci-worker/blob/2d57a085f65a6eabbfe76f2de6794f025b211f4e/src/main/scala/util/YamlDocBuilder.scala#L4 + */ + + sealed trait YamlValue + sealed trait YamlScalar extends YamlValue + case class YamlBlank() extends YamlScalar + case class YamlEnum(v: String, i: Int) extends YamlScalar + case class YamlInt(v: Int) extends YamlScalar + case class YamlBool(v: Boolean) extends YamlScalar + case class YamlString(v: String) extends YamlScalar + + case class YamlList(v: Seq[YamlListElem]) extends YamlValue + case class YamlListElem(v: YamlValue) extends YamlValue + + object YamlMap: + def apply(v: (String, YamlValue)*): YamlMap = YamlMap(v.to(mutable.LinkedHashMap)) + def apply(k: String, v: String): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlString(v))) + def apply(k: String, v: Int): YamlMap = YamlMap(mutable.LinkedHashMap(k -> YamlInt(v))) + def apply(k: String, v: YamlValue): YamlMap = YamlMap(mutable.LinkedHashMap(k -> v)) + + case class YamlMap(v: mutable.LinkedHashMap[String, YamlValue]) extends YamlValue + + def build(root: YamlValue, indent: Int = 0): YamlDocBuilder = + val builder = YamlDocBuilder(currIndent = indent) + builder.build(root) + builder private def quoteAndEscape(s: String): String = "\"" + escape(s) + "\""