diff --git a/src/main/scala/eu/neverblink/jelly/cli/App.scala b/src/main/scala/eu/neverblink/jelly/cli/App.scala index ce5efc0..ba236d4 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/App.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/App.scala @@ -3,6 +3,7 @@ package eu.neverblink.jelly.cli import caseapp.* import eu.neverblink.jelly.cli.command.* import eu.neverblink.jelly.cli.command.rdf.* +import eu.neverblink.jelly.cli.util.jena.riot.CliRiot import org.apache.jena.sys.JenaSystem /** Main entrypoint. @@ -11,6 +12,8 @@ object App extends CommandsEntryPoint: // Initialize Jena now to avoid race conditions later JenaSystem.init() + // Initialize the CLI Riot parsers + CliRiot.initialize() override def enableCompletionsCommand: Boolean = true diff --git a/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala b/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala index 68f6d30..4e3ad79 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/JellyCommand.scala @@ -19,9 +19,9 @@ abstract class JellyCommand[T <: HasJellyCommandOptions: {Parser, Help}] extends private var isTest = false private var options: Option[T] = None - final protected[cli] var out = System.out - final protected[cli] var err = System.err - final protected[cli] var in = System.in + final protected[cli] var out: PrintStream = System.out + final protected[cli] var err: PrintStream = System.err + final protected[cli] var in: InputStream = System.in private var osOut: ByteArrayOutputStream = uninitialized private var osErr: ByteArrayOutputStream = uninitialized @@ -119,7 +119,7 @@ abstract class JellyCommand[T <: HasJellyCommandOptions: {Parser, Help}] extends if isTest then in else System.in - final def setStdIn(data: ByteArrayInputStream): Unit = + final def setStdIn(data: InputStream): Unit = in = data final def getOutStream: OutputStream = diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/CliRiot.scala 
b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/CliRiot.scala
new file mode 100644
index 0000000..90407e8
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/CliRiot.scala
@@ -0,0 +1,88 @@
+package eu.neverblink.jelly.cli.util.jena.riot
+
+import org.apache.jena.atlas.web.ContentType
+import org.apache.jena.riot.{RDFLanguages, RDFParserRegistry, ReaderRIOT}
+import org.apache.jena.riot.lang.{LangRIOT, RiotParsers}
+import org.apache.jena.riot.system.{ParserProfile, StreamRDF}
+import org.apache.jena.riot.tokens.{Tokenizer, TokenizerText}
+import org.apache.jena.sparql.util.Context
+
+import java.io.{InputStream, Reader}
+
+/** Registration utilities for jelly-cli's overrides of Apache Jena's Riot components (e.g.,
+  * parsers).
+  *
+  * The initialize() method must be called before using any of the parsers, right after
+  * JenaSystem.init().
+  */
+object CliRiot:
+  // Read and written only while holding CliRiot's monitor (see initialize()).
+  private var initialized = false
+
+  /** Installs the jelly-cli N-Triples and N-Quads reader factories into RiotParsers and
+    * registers them with RDFParserRegistry. Only the first call does the registration; later
+    * calls find the flag already set and do nothing.
+    */
+  def initialize(): Unit = CliRiot.synchronized {
+    // A guarded block rather than an early `return`: same effect, single exit point.
+    if !initialized then
+      RiotParsers.factoryNT = (_, parserProfile) => NTriplesReader(parserProfile)
+      RiotParsers.factoryNQ = (_, parserProfile) => NQuadsReader(parserProfile)
+      RDFParserRegistry.registerLangTriples(RDFLanguages.NTRIPLES, RiotParsers.factoryNT)
+      RDFParserRegistry.registerLangQuads(RDFLanguages.NQUADS, RiotParsers.factoryNQ)
+      initialized = true
+  }
+
+  /** Base reader for parsing N-Triples and N-Quads. Heavily inspired by the Jena Riot code:
+    * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java
+    * @param parserProfile
+    *   parser profile
+    */
+  private abstract class BaseReader(parserProfile: ParserProfile) extends ReaderRIOT:
+    /** Builds the language-specific parser that reads from `tokenizer` and emits to `output`. */
+    def create(tokenizer: Tokenizer, output: StreamRDF, context: Context): LangRIOT
+
+    /** Tokenizes `in` and runs the parser from create(); `baseURI` and `ct` are not used. */
+    final def read(
+      in: InputStream,
+      baseURI: String,
+      ct: ContentType,
+      output: StreamRDF,
+      context: Context,
+    ): Unit =
+      val tok = TokenizerText.create()
+        .source(in)
+        .errorHandler(parserProfile.getErrorHandler)
+        .build()
+      create(tok, output, context).parse()
+
+    /** Tokenizes `reader` and runs the parser from create(); `baseURI` and `ct` are not used. */
+    final def read(
+      reader: Reader,
+      baseURI: String,
+      ct: ContentType,
+      output: StreamRDF,
+      context: Context,
+    ): Unit =
+      val tok = TokenizerText.create()
+        .source(reader)
+        .errorHandler(parserProfile.getErrorHandler)
+        .build()
+      create(tok, output, context).parse()
+
+  /** Reader whose create() returns a LangNTriplesGeneralized parser. */
+  private final class NTriplesReader(parserProfile: ParserProfile)
+      extends BaseReader(parserProfile):
+    override def create(
+      tokenizer: Tokenizer,
+      output: StreamRDF,
+      context: Context,
+    ): LangRIOT = new LangNTriplesGeneralized(tokenizer, parserProfile, output)
+
+  /** Reader whose create() returns a LangNQuadsGeneralized parser. */
+  private final class NQuadsReader(parserProfile: ParserProfile) extends BaseReader(parserProfile):
+    override def create(
+      tokenizer: Tokenizer,
+      output: StreamRDF,
+      context: Context,
+    ): LangRIOT = new LangNQuadsGeneralized(tokenizer, parserProfile, output)
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNQuadsGeneralized.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNQuadsGeneralized.scala
new file mode 100644
index 0000000..2415066
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNQuadsGeneralized.scala
@@ -0,0 +1,51 @@
+package eu.neverblink.jelly.cli.util.jena.riot
+
+import org.apache.jena.graph.Node
+import org.apache.jena.riot.system.{ParserProfile, StreamRDF}
+import 
org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer}
+import org.apache.jena.riot.{Lang, RDFLanguages}
+import org.apache.jena.sparql.core.Quad
+
+/** Parser for generalized N-Quads. Heavily inspired by the Jena Riot code:
+  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNQuads.java
+  */
+final class LangNQuadsGeneralized(tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
+    extends LangNTupleGeneralized[Quad](tokens, profile, dest):
+
+  // Graph passed to profile.create() by tokenAsNode; null for no graph.
+  private var currentGraph: Node = null
+
+  override def getLang: Lang = RDFLanguages.NQUADS
+
+  /** Parses every quad in the token stream and sends each non-null result to the sink. */
+  override protected def runParser(): Unit =
+    while hasNext do
+      val quad = parseOne
+      if quad != null then dest.quad(quad)
+
+  /** Parses one statement: subject, predicate, object, an optional graph node, then DOT.
+    * A statement without a graph node gets Quad.defaultGraphNodeGenerated as its graph.
+    */
+  override protected def parseOne: Quad =
+    val sToken = nextToken
+    val s = parseNode(sToken)
+    val p = parseNode(nextToken)
+    val o = parseNode(nextToken)
+    val graphOrDot = nextToken
+    if graphOrDot.getType eq TokenType.EOF then
+      exception(graphOrDot, "Premature end of file: Quad not terminated by DOT: %s", graphOrDot)
+    // NOTE(review): S, P and O are already converted at this point, so the currentGraph
+    // assigned below is first seen by tokenAsNode while parsing the NEXT quad. That only
+    // matters if blank node labels are scoped per graph — confirm against the profile in use.
+    val hasGraph = graphOrDot.getType ne TokenType.DOT
+    val c = if hasGraph then parseNode(graphOrDot) else Quad.defaultGraphNodeGenerated
+    val endToken = if hasGraph then nextToken else graphOrDot
+    currentGraph = if hasGraph then c else null
+    // Check end of quad
+    if endToken.getType ne TokenType.DOT then
+      exception(endToken, "Quad not terminated by DOT: %s", endToken)
+    profile.createQuad(c, s, p, o, sToken.getLine, sToken.getColumn)
+
+  /** Converts a token to a node through the profile, passing the current graph. */
+  override protected def tokenAsNode(token: Token): Node =
+    profile.create(currentGraph, token)
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTriplesGeneralized.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTriplesGeneralized.scala
new file mode 100644
index 0000000..8a8bf4f
--- /dev/null
+++ 
b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTriplesGeneralized.scala
@@ -0,0 +1,32 @@
+package eu.neverblink.jelly.cli.util.jena.riot
+
+import org.apache.jena.graph.{Node, Triple}
+import org.apache.jena.riot.system.{ParserProfile, StreamRDF}
+import org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer}
+import org.apache.jena.riot.{Lang, RDFLanguages}
+
+/** Parser for generalized N-Triples. Heavily inspired by the Jena Riot code:
+  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
+  */
+final class LangNTriplesGeneralized(tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
+    extends LangNTupleGeneralized[Triple](tokens, profile, dest):
+
+  override def getLang: Lang = RDFLanguages.NTRIPLES
+
+  /** Parses every triple in the token stream and sends each non-null result to the sink. */
+  override protected def runParser(): Unit =
+    while hasNext do
+      val triple = parseOne
+      if triple != null then dest.triple(triple)
+
+  /** Parses one subject-predicate-object statement and requires a DOT right after it. */
+  override protected def parseOne: Triple =
+    val parsed = parseTripleGeneralized
+    val terminator = nextToken
+    if terminator.getType ne TokenType.DOT then
+      exception(terminator, "Triple not terminated by DOT: %s", terminator)
+    parsed
+
+  /** Converts a token to a node through the profile, passing null as the graph. */
+  override protected def tokenAsNode(token: Token): Node =
+    profile.create(null, token)
diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTupleGeneralized.scala b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTupleGeneralized.scala
new file mode 100644
index 0000000..3c0f995
--- /dev/null
+++ b/src/main/scala/eu/neverblink/jelly/cli/util/jena/riot/LangNTupleGeneralized.scala
@@ -0,0 +1,42 @@
+package eu.neverblink.jelly.cli.util.jena.riot
+
+import org.apache.jena.graph.{Node, NodeFactory, Triple}
+import org.apache.jena.riot.lang.LangNTuple
+import org.apache.jena.riot.system.{ParserProfile, StreamRDF}
+import org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer}
+
+/** Base class for parsing N-Triples and N-Quads. Heavily inspired by the Jena Riot code:
+  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java
+  */
+abstract class LangNTupleGeneralized[T](tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
+    extends LangNTuple[T](tokens, profile, dest):
+
+  /** Converts `token` into a node. An LT2 token opens a nested triple term, which is parsed
+    * recursively; any other token is passed to checkRDFTerm and then to tokenAsNode.
+    */
+  protected final def parseNode(token: Token): Node =
+    if token.isEOF then exception(token, "Premature end of file: %s", token)
+    if token.hasType(TokenType.LT2) then parseTripleTermGeneralized
+    else
+      checkRDFTerm(token)
+      tokenAsNode(token)
+
+  /** Reads subject, predicate and object in that order and builds a triple that carries the
+    * line and column of the subject token.
+    */
+  protected final def parseTripleGeneralized: Triple =
+    val first = nextToken
+    val subject = parseNode(first)
+    val predicate = parseNode(nextToken)
+    val obj = parseNode(nextToken)
+    profile.createTriple(subject, predicate, obj, first.getLine, first.getColumn)
+
+  /** Parses the inside of a triple term (parseNode has already consumed the opening token)
+    * and requires a closing GT2 token before wrapping the triple in a node.
+    */
+  protected final def parseTripleTermGeneralized: Node =
+    val inner = parseTripleGeneralized
+    val closing = nextToken
+    if closing.getType ne TokenType.GT2 then
+      exception(closing, "Triple term not terminated by >>: %s", closing)
+    NodeFactory.createTripleNode(inner)
diff --git a/src/test/resources/generalized.nq b/src/test/resources/generalized.nq
new file mode 100644
index 0000000..c4f17ac
--- /dev/null
+++ b/src/test/resources/generalized.nq
@@ -0,0 +1,8 @@
+ _:b1 .
+"Resource 1" .
+ "Property Label" .
+_:b1 << _:b1 _:b2 _:b3 >> .
+ _:b1 _:b1 .
+"Resource 1" "literal graph"^^ .
+ "Property Label" .
+_:b1 << _:b1 _:b2 _:b3 >> "literal"@en .
diff --git a/src/test/resources/generalized.nt b/src/test/resources/generalized.nt
new file mode 100644
index 0000000..ea10e54
--- /dev/null
+++ b/src/test/resources/generalized.nt
@@ -0,0 +1,4 @@
+ _:b1 .
+"Resource 1" .
+ "Property Label" .
+_:b1 << _:b1 _:b2 _:b3 >> .
diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala b/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala index 1570391..f17a522 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/helpers/TestFixtureHelper.scala @@ -1,5 +1,6 @@ package eu.neverblink.jelly.cli.command.helpers +import eu.neverblink.jelly.cli.util.jena.riot.CliRiot import eu.ostrzyciel.jelly.convert.jena.riot.{JellyFormatVariant, JellyLanguage} import org.apache.jena.riot.{Lang, RDFDataMgr, RDFFormat, RDFLanguages} import org.apache.jena.sys.JenaSystem @@ -18,6 +19,7 @@ trait TestFixtureHelper extends BeforeAndAfterAll: TestFixtureHelper.synchronized { JenaSystem.init() + CliRiot.initialize() } private val tmpDir: Path = Files.createTempDirectory("jelly-cli") diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala index ba86faa..d1735b8 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfToJellySpec.scala @@ -8,6 +8,7 @@ import eu.ostrzyciel.jelly.core.proto.v1.{LogicalStreamType, RdfStreamFrame} import eu.ostrzyciel.jelly.core.{IoUtils, JellyOptions} import org.apache.jena.rdf.model.{Model, ModelFactory} import org.apache.jena.riot.{RDFLanguages, RDFParser} +import org.apache.jena.sparql.core.DatasetGraphFactory import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec @@ -72,6 +73,30 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: content.containsAll(tripleModel.listStatements()) } + "input stream to output stream, generalized RDF (N-Triples)" in { + val inputStream = new FileInputStream(getClass.getResource("/generalized.nt").getPath) + RdfToJelly.setStdIn(inputStream) + val (out, err) 
= RdfToJelly.runTestCommand( + List("rdf", "to-jelly", "--in-format=nt"), + ) + val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes) + val content = translateJellyBack(newIn) + content.size() should be(4) + } + + "input stream to output stream, generalized RDF (N-Quads)" in { + val inputStream = new FileInputStream(getClass.getResource("/generalized.nq").getPath) + RdfToJelly.setStdIn(inputStream) + val (out, err) = RdfToJelly.runTestCommand( + List("rdf", "to-jelly", "--in-format=nq"), + ) + val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes) + val ds = DatasetGraphFactory.create() + RDFParser.source(newIn).lang(JellyLanguage.JELLY).parse(ds) + ds.size() should be(4) // 4 named graphs + ds.getDefaultGraph.size() should be(4) // 4 triples in the default graph + } + "an input stream to file" in withEmptyJellyFile { j => val input = DataGenHelper.generateJenaInputStream(testCardinality) RdfToJelly.setStdIn(input) @@ -211,6 +236,7 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: } } } + "handle conversion of other formats to Jelly" when { "NTriples" in { val input = DataGenHelper.generateJenaInputStream(testCardinality, RDFLanguages.NTRIPLES) @@ -385,6 +411,7 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: } } } + "throw proper exception" when { "invalid format is specified" in withFullJenaFile { f => val e =