-
Notifications
You must be signed in to change notification settings - Fork 2
Add an inspect command that gathers metrics about a Jelly file #65
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
645f152
0c55333
da9dcfc
47300bc
4b06355
0102f6d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,4 +22,5 @@ object App extends CommandsEntryPoint: | |
| Version, | ||
| RdfFromJelly, | ||
| RdfToJelly, | ||
| RdfInspect, | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| package eu.neverblink.jelly.cli.command.rdf | ||
|
|
||
| import caseapp.{ExtraName, Recurse} | ||
| import caseapp.core.RemainingArgs | ||
| import eu.neverblink.jelly.cli.util.{FrameInfo, JellyUtil, MetricsPrinter} | ||
| import eu.neverblink.jelly.cli.* | ||
| import eu.ostrzyciel.jelly.core.proto.v1.* | ||
|
|
||
| import java.io.InputStream | ||
|
|
||
| case class RdfInspectOptions( | ||
| @Recurse | ||
| common: JellyCommandOptions = JellyCommandOptions(), | ||
| @ExtraName("to") outputFile: Option[String] = None, | ||
| @ExtraName("per-frame") perFrame: Boolean = false, | ||
| ) extends HasJellyCommandOptions | ||
|
|
||
| object RdfInspect extends JellyCommand[RdfInspectOptions]: | ||
|
|
||
| override def names: List[List[String]] = List( | ||
| List("rdf", "inspect"), | ||
| ) | ||
|
|
||
| override final def group = "rdf" | ||
|
|
||
| override def doRun(options: RdfInspectOptions, remainingArgs: RemainingArgs): Unit = | ||
| val (inputStream, outputStream) = | ||
| this.getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile) | ||
| val printer = inspectJelly(inputStream) | ||
| if options.perFrame then printer.printPerFrame(outputStream) | ||
| else printer.printAggregate(outputStream) | ||
|
|
||
| private def inspectJelly( | ||
| inputStream: InputStream, | ||
| ): MetricsPrinter = | ||
|
|
||
| inline def computeMetrics( | ||
| frame: RdfStreamFrame, | ||
| frameIndex: Int, | ||
| printer: MetricsPrinter, | ||
| ): Unit = | ||
| val metrics = new FrameInfo() | ||
| frame.rows.foreach(r => metricsForRow(r, metrics)) | ||
| printer.frameInfo += metrics | ||
|
|
||
| try { | ||
| val allRows = JellyUtil.iterateRdfStream(inputStream).toList | ||
|
||
| // we need to check if the first frame contains options | ||
| val streamOptions = checkOptions(allRows) | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| val printer = new MetricsPrinter(streamOptions) | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // We compute the metrics for each frame | ||
| // and then sum them all during the printing if desired | ||
| allRows.zipWithIndex.foreach { case (maybeFrame, frameIndex) => | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| computeMetrics(maybeFrame, frameIndex, printer) | ||
| } | ||
| printer | ||
| } catch { | ||
| case e: Exception => | ||
| throw InvalidJellyFile(e) | ||
| } | ||
|
|
||
| private def metricsForRow( | ||
| row: RdfStreamRow, | ||
| metadata: FrameInfo, | ||
| ): Unit = | ||
| row.row match { | ||
| case r: RdfTriple => metadata.tripleCount += 1 | ||
| case r: RdfQuad => metadata.quadCount += 1 | ||
| case r: RdfNameEntry => metadata.nameCount += 1 | ||
| case r: RdfPrefixEntry => metadata.prefixCount += 1 | ||
| case r: RdfNamespaceDeclaration => metadata.namespaceCount += 1 | ||
| case r: RdfDatatypeEntry => metadata.datatypeCount += 1 | ||
| case r: RdfGraphStart => metadata.graphStartCount += 1 | ||
| case r: RdfGraphEnd => metadata.graphEndCount += 1 | ||
| case r: RdfStreamOptions => metadata.optionCount += 1 | ||
| } | ||
|
|
||
| /** Checks whether the first frame in the stream contains options and returns them. | ||
| * @param allFrames | ||
| * The list of all frames in the stream. | ||
| * @return | ||
| * The options from the first frame. | ||
| * @throws RuntimeException | ||
| * If the first frame does not contain options or if there are no frames in the stream. | ||
| */ | ||
| private def checkOptions(allFrames: List[RdfStreamFrame]): RdfStreamOptions = | ||
| if allFrames.isEmpty then throw new RuntimeException("No frames in the stream.") | ||
| if allFrames.head.rows.isEmpty then throw new RuntimeException("No rows in the frame.") | ||
| val frameRows = allFrames.head.rows | ||
| frameRows.head.row match { | ||
| case r: RdfStreamOptions => r | ||
| case _ => throw new RuntimeException("First row of the frame is not an options row.") | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| package eu.neverblink.jelly.cli.util | ||
|
|
||
| import eu.ostrzyciel.jelly.core.IoUtils | ||
| import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame | ||
|
|
||
| import java.io.InputStream | ||
|
|
||
| object JellyUtil: | ||
| /** This method reads the Jelly file and returns an iterator of RdfStreamFrame | ||
| * | ||
| * @param inputStream | ||
| * @param outputStream | ||
| * @return | ||
| */ | ||
| def iterateRdfStream( | ||
| inputStream: InputStream, | ||
| ): Iterator[RdfStreamFrame] = | ||
| IoUtils.autodetectDelimiting(inputStream) match | ||
| case (false, newIn) => | ||
| // Non-delimited Jelly file | ||
| // In this case, we can only read one frame | ||
| Iterator(RdfStreamFrame.parseFrom(newIn)) | ||
| case (true, newIn) => | ||
| // Delimited Jelly file | ||
| // In this case, we can read multiple frames | ||
| Iterator.continually(RdfStreamFrame.parseDelimitedFrom(newIn)) | ||
| .takeWhile(_.isDefined).map(_.get) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| package eu.neverblink.jelly.cli.util | ||
|
|
||
| import eu.neverblink.jelly.cli.util.MetricsPrinter.{formatOptions, formatStats} | ||
| import eu.neverblink.jelly.cli.util.YamlDocBuilder.YamlMap | ||
| import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamOptions | ||
|
|
||
| import java.io.OutputStream | ||
| import scala.collection.mutable.ListBuffer | ||
|
|
||
| /** This class is used to store the metrics for a single frame | ||
| */ | ||
| class FrameInfo: | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| var optionCount: Int = 0 | ||
| var nameCount: Int = 0 | ||
| var namespaceCount: Int = 0 | ||
| var tripleCount: Int = 0 | ||
| var quadCount: Int = 0 | ||
| var prefixCount: Int = 0 | ||
| var datatypeCount: Int = 0 | ||
| var graphStartCount: Int = 0 | ||
| var graphEndCount: Int = 0 | ||
|
|
||
| def +=(other: FrameInfo): FrameInfo = { | ||
| this.optionCount += other.optionCount | ||
| this.nameCount += other.nameCount | ||
| this.namespaceCount += other.namespaceCount | ||
| this.tripleCount += other.tripleCount | ||
| this.quadCount += other.quadCount | ||
| this.prefixCount += other.prefixCount | ||
| this.datatypeCount += other.datatypeCount | ||
| this.graphStartCount += other.graphStartCount | ||
| this.graphEndCount += other.graphEndCount | ||
| this | ||
| } | ||
|
|
||
| end FrameInfo | ||
|
|
||
| class MetricsPrinter(printOptions: RdfStreamOptions): | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| var frameInfo: ListBuffer[FrameInfo] = ListBuffer.empty | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| def printPerFrame(o: OutputStream): Unit = { | ||
| val options = formatOptions(options = printOptions) | ||
| val yamlFrames = YamlDocBuilder.YamlList(frameInfo.map { frame => | ||
| formatStats(frame) | ||
| }.toSeq) | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| val fullString = | ||
| YamlDocBuilder.build( | ||
| YamlMap( | ||
| "stream_options" -> options, | ||
| "frames" -> yamlFrames, | ||
| ), | ||
| ) | ||
| o.write(fullString.getBytes) | ||
|
|
||
| } | ||
|
|
||
| def printAggregate(o: OutputStream): Unit = { | ||
| val sumCounts = frameInfo.reduce(_ += _) | ||
| val options = formatOptions(options = printOptions) | ||
| val fullString = | ||
| YamlDocBuilder.build( | ||
| YamlMap( | ||
| "stream_options" -> options, | ||
| "frames" -> formatStats(sumCounts), | ||
| ), | ||
| ) | ||
| o.write(fullString.getBytes) | ||
| } | ||
|
|
||
| end MetricsPrinter | ||
|
|
||
| object MetricsPrinter: | ||
|
|
||
| /** This method converts a boolean to an integer | ||
| */ | ||
| private def boolToInt(b: Boolean): Int = | ||
| if b then 1 else 0 | ||
|
|
||
| def formatOptions( | ||
| options: RdfStreamOptions, | ||
| ): YamlDocBuilder.YamlMap = | ||
Karolina-Bogacka marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| YamlDocBuilder.YamlMap( | ||
| "rdf_star" -> YamlDocBuilder.YamlInt(boolToInt(options.rdfStar)), | ||
| "stream_name" -> YamlDocBuilder.YamlString(options.streamName), | ||
| "generalized_statements" -> YamlDocBuilder.YamlInt(boolToInt(options.generalizedStatements)), | ||
| "version" -> YamlDocBuilder.YamlInt(options.version), | ||
| "max_datatype_table_size" -> YamlDocBuilder.YamlInt(options.maxDatatypeTableSize), | ||
| "max_name_table_size" -> YamlDocBuilder.YamlInt(options.maxNameTableSize), | ||
| "max_prefix_table_size" -> YamlDocBuilder.YamlInt(options.maxPrefixTableSize), | ||
| "logical_type" -> YamlDocBuilder.YamlInt(options.logicalType.value), | ||
| "physical_type" -> YamlDocBuilder.YamlInt(options.physicalType.value), | ||
| ) | ||
|
|
||
| def formatStats( | ||
| frame: FrameInfo, | ||
| ): YamlDocBuilder.YamlMap = | ||
| YamlDocBuilder.YamlMap( | ||
| "option_count" -> YamlDocBuilder.YamlInt(frame.optionCount), | ||
| "name_count" -> YamlDocBuilder.YamlInt(frame.nameCount), | ||
| "namespace_count" -> YamlDocBuilder.YamlInt(frame.namespaceCount), | ||
| "triple_count" -> YamlDocBuilder.YamlInt(frame.tripleCount), | ||
| "quad_count" -> YamlDocBuilder.YamlInt(frame.quadCount), | ||
| "prefix_count" -> YamlDocBuilder.YamlInt(frame.prefixCount), | ||
| "datatype_count" -> YamlDocBuilder.YamlInt(frame.datatypeCount), | ||
| "graph_start_count" -> YamlDocBuilder.YamlInt(frame.graphStartCount), | ||
| "graph_end_count" -> YamlDocBuilder.YamlInt(frame.graphEndCount), | ||
| ) | ||
|
|
||
| end MetricsPrinter | ||
Uh oh!
There was an error while loading. Please reload this page.