-
Notifications
You must be signed in to change notification settings - Fork 2
rdf to-jelly: support generalized RDF input #98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| package eu.neverblink.jelly.cli.util.jena.riot | ||
|
|
||
| import org.apache.jena.atlas.web.ContentType | ||
| import org.apache.jena.riot.{RDFLanguages, RDFParserRegistry, ReaderRIOT} | ||
| import org.apache.jena.riot.lang.{LangRIOT, RiotParsers} | ||
| import org.apache.jena.riot.system.{ParserProfile, StreamRDF} | ||
| import org.apache.jena.riot.tokens.{Tokenizer, TokenizerText} | ||
| import org.apache.jena.sparql.util.Context | ||
|
|
||
| import java.io.{InputStream, Reader} | ||
|
|
||
/** Registration utilities for jelly-cli's overrides of Apache Jena's Riot components (e.g.,
  * parsers).
  *
  * The initialize() method must be called before using any of the parsers, right after
  * JenaSystem.init().
  */
object CliRiot:
  // Only read and written inside CliRiot.synchronized; true once the factories are registered.
  private var initialized = false

  /** Replaces Jena's N-Triples and N-Quads parser factories with the generalized readers and
    * registers those factories for the corresponding languages.
    *
    * The body runs under CliRiot's monitor and performs the registration at most once; later
    * calls do nothing.
    */
  def initialize(): Unit = CliRiot.synchronized {
    // Written as a guard rather than an early `return`, so the block stays a plain expression.
    if !initialized then
      RiotParsers.factoryNT = (_, parserProfile) => NTriplesReader(parserProfile)
      RiotParsers.factoryNQ = (_, parserProfile) => NQuadsReader(parserProfile)
      RDFParserRegistry.registerLangTriples(RDFLanguages.NTRIPLES, RiotParsers.factoryNT)
      RDFParserRegistry.registerLangQuads(RDFLanguages.NQUADS, RiotParsers.factoryNQ)
      initialized = true
  }
|
|
||
/** Shared ReaderRIOT implementation for the N-Triples and N-Quads readers. Heavily inspired by
  * the Jena Riot code:
  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/RiotParsers.java
  * @param parserProfile
  *   parser profile; its error handler is handed to the tokenizer
  */
private abstract class BaseReader(parserProfile: ParserProfile) extends ReaderRIOT:
  /** Builds the language parser that reads from `tokenizer` and emits to `output`. */
  def create(tokenizer: Tokenizer, output: StreamRDF, context: Context): LangRIOT

  // Creates the concrete parser for the tokenizer and runs it to completion.
  private def parseTokens(tokenizer: Tokenizer, output: StreamRDF, context: Context): Unit =
    create(tokenizer, output, context).parse()

  /** Tokenizes the byte stream and parses it into `output`. `baseURI` and `ct` are not used. */
  final def read(
      in: InputStream,
      baseURI: String,
      ct: ContentType,
      output: StreamRDF,
      context: Context,
  ): Unit =
    val builder = TokenizerText.create()
      .source(in)
      .errorHandler(parserProfile.getErrorHandler)
    parseTokens(builder.build(), output, context)

  /** Tokenizes the character stream and parses it into `output`. `baseURI` and `ct` are not
    * used.
    */
  final def read(
      reader: Reader,
      baseURI: String,
      ct: ContentType,
      output: StreamRDF,
      context: Context,
  ): Unit =
    val builder = TokenizerText.create()
      .source(reader)
      .errorHandler(parserProfile.getErrorHandler)
    parseTokens(builder.build(), output, context)
|
|
||
/** Reader whose parser is a [[LangNTriplesGeneralized]] built on the given profile. */
private final class NTriplesReader(parserProfile: ParserProfile)
    extends BaseReader(parserProfile):
  // `context` is not needed to construct the parser.
  override def create(tokenizer: Tokenizer, output: StreamRDF, context: Context): LangRIOT =
    LangNTriplesGeneralized(tokenizer, parserProfile, output)
|
|
||
/** Reader whose parser is a [[LangNQuadsGeneralized]] built on the given profile. */
private final class NQuadsReader(parserProfile: ParserProfile)
    extends BaseReader(parserProfile):
  // `context` is not needed to construct the parser.
  override def create(tokenizer: Tokenizer, output: StreamRDF, context: Context): LangRIOT =
    LangNQuadsGeneralized(tokenizer, parserProfile, output)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| package eu.neverblink.jelly.cli.util.jena.riot | ||
|
|
||
| import org.apache.jena.graph.Node | ||
| import org.apache.jena.riot.system.{ParserProfile, StreamRDF} | ||
| import org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer} | ||
| import org.apache.jena.riot.{Lang, RDFLanguages} | ||
| import org.apache.jena.sparql.core.Quad | ||
|
|
||
/** Parser for generalized N-Quads. Heavily inspired by the Jena Riot code:
  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNQuads.java
  */
final class LangNQuadsGeneralized(tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
    extends LangNTupleGeneralized[Quad](tokens, profile, dest):

  // Graph label of the most recently parsed quad; null when that quad had no graph label.
  private var currentGraph: Node = null

  override def getLang: Lang = RDFLanguages.NQUADS

  /** Parses the whole stream of quads, sending each non-null result to the sink. */
  override protected def runParser(): Unit =
    while hasNext do
      Option(parseOne).foreach(quad => dest.quad(quad))

  /** Parses one statement: subject, predicate, object, an optional graph label, then DOT.
    *
    * A statement without a graph label is placed in Quad.defaultGraphNodeGenerated.
    */
  override protected def parseOne: Quad =
    val subjectToken = nextToken
    val subject = parseNode(subjectToken)
    val predicate = parseNode(nextToken)
    val obj = parseNode(nextToken)
    val afterObject = nextToken // Either the graph label or the terminating DOT
    if afterObject.getType eq TokenType.EOF then
      exception(afterObject, "Premature end of file: Quad not terminated by DOT: %s", afterObject)
    // The graph label is only read here, after S, P and O were already turned into nodes, so
    // tokenAsNode saw the currentGraph left behind by the previous quad while creating them.
    // NOTE(review): confirm this is acceptable if the profile scopes blank node labels by graph.
    val (graph, terminator) =
      if afterObject.getType eq TokenType.DOT then
        currentGraph = null
        (Quad.defaultGraphNodeGenerated, afterObject)
      else
        val label = parseNode(afterObject)
        val following = nextToken
        currentGraph = label
        (label, following)
    // Check end of quad
    if terminator.getType ne TokenType.DOT then
      exception(terminator, "Quad not terminated by DOT: %s", terminator)
    profile.createQuad(graph, subject, predicate, obj, subjectToken.getLine, subjectToken.getColumn)

  /** Creates the node for a token, passing the current graph label (possibly null) as scope. */
  override protected def tokenAsNode(token: Token): Node =
    profile.create(currentGraph, token)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| package eu.neverblink.jelly.cli.util.jena.riot | ||
|
|
||
| import org.apache.jena.graph.{Node, Triple} | ||
| import org.apache.jena.riot.system.{ParserProfile, StreamRDF} | ||
| import org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer} | ||
| import org.apache.jena.riot.{Lang, RDFLanguages} | ||
|
|
||
/** Parser for generalized N-Triples. Heavily inspired by the Jena Riot code:
  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTriples.java
  */
final class LangNTriplesGeneralized(tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
    extends LangNTupleGeneralized[Triple](tokens, profile, dest):

  override def getLang: Lang = RDFLanguages.NTRIPLES

  /** Parses the whole stream of triples, sending each non-null result to the sink. */
  override protected def runParser(): Unit =
    while hasNext do
      Option(parseOne).foreach(triple => dest.triple(triple))

  /** Parses one triple and requires the token that follows it to be a DOT. */
  override protected def parseOne: Triple =
    val parsed = parseTripleGeneralized
    val terminator = nextToken
    if terminator.getType ne TokenType.DOT then
      exception(terminator, "Triple not terminated by DOT: %s", terminator)
    parsed

  /** Creates the node for a token; a null scope is passed to the profile. */
  override protected def tokenAsNode(token: Token): Node =
    profile.create(null, token)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| package eu.neverblink.jelly.cli.util.jena.riot | ||
|
|
||
| import org.apache.jena.graph.{Node, NodeFactory, Triple} | ||
| import org.apache.jena.riot.lang.LangNTuple | ||
| import org.apache.jena.riot.system.{ParserProfile, StreamRDF} | ||
| import org.apache.jena.riot.tokens.{Token, TokenType, Tokenizer} | ||
|
|
||
/** Base class for parsing N-Triples and N-Quads. Heavily inspired by the Jena Riot code:
  * https://github.com/apache/jena/blob/bd97ad4cf731ade857926787dd2df735644a354b/jena-arq/src/main/java/org/apache/jena/riot/lang/LangNTuple.java
  */
abstract class LangNTupleGeneralized[T](tokens: Tokenizer, profile: ParserProfile, dest: StreamRDF)
    extends LangNTuple[T](tokens, profile, dest):

  /** Turns a token into a node. An LT2 token starts a nested triple term; any other token is
    * checked with checkRDFTerm and converted with tokenAsNode. EOF is reported as an error.
    */
  protected final def parseNode(token: Token): Node =
    if token.isEOF then exception(token, "Premature end of file: %s", token)
    if !token.hasType(TokenType.LT2) then
      checkRDFTerm(token)
      tokenAsNode(token)
    else parseTripleTermGeneralized

  /** Reads three nodes in order (subject, predicate, object) and builds a triple located at the
    * line and column of the first token.
    */
  protected final def parseTripleGeneralized: Triple =
    val firstToken = nextToken
    val subject = parseNode(firstToken)
    val predicate = parseNode(nextToken)
    val obj = parseNode(nextToken)
    profile.createTriple(subject, predicate, obj, firstToken.getLine, firstToken.getColumn)

  /** Parses the triple inside a triple term and requires a closing GT2 token after it. */
  protected final def parseTripleTermGeneralized: Node =
    val inner = parseTripleGeneralized
    val closing = nextToken
    if closing.getType ne TokenType.GT2 then
      exception(closing, "Triple term not terminated by >>: %s", closing)
    NodeFactory.createTripleNode(inner)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| <http://example.org/resource/r1> _:b1 <http://example.org/resource/r2> . | ||
| "Resource 1" <http://example.org/property/p> <http://example.org/resource/r3> . | ||
| <http://example.org/resource/r3> "Property Label" <http://example.org/resource/r1> . | ||
| _:b1 << _:b1 _:b2 _:b3 >> <http://example.org/resource/r4> . | ||
| <http://example.org/resource/r1> _:b1 <http://example.org/resource/r2> _:b1 . | ||
| "Resource 1" <http://example.org/property/p> <http://example.org/resource/r3> "literal graph"^^<http://example.org> . | ||
| <http://example.org/resource/r3> "Property Label" <http://example.org/resource/r1> <http://example.org> . | ||
| _:b1 << _:b1 _:b2 _:b3 >> <http://example.org/resource/r4> "literal"@en . |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| <http://example.org/resource/r1> _:b1 <http://example.org/resource/r2> . | ||
| "Resource 1" <http://example.org/property/p> <http://example.org/resource/r3> . | ||
| <http://example.org/resource/r3> "Property Label" <http://example.org/resource/r1> . | ||
| _:b1 << _:b1 _:b2 _:b3 >> <http://example.org/resource/r4> . |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ import eu.ostrzyciel.jelly.core.proto.v1.{LogicalStreamType, RdfStreamFrame} | |
| import eu.ostrzyciel.jelly.core.{IoUtils, JellyOptions} | ||
| import org.apache.jena.rdf.model.{Model, ModelFactory} | ||
| import org.apache.jena.riot.{RDFLanguages, RDFParser} | ||
| import org.apache.jena.sparql.core.DatasetGraphFactory | ||
| import org.scalatest.matchers.should.Matchers | ||
| import org.scalatest.wordspec.AnyWordSpec | ||
|
|
||
|
|
@@ -72,6 +73,30 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: | |
| content.containsAll(tripleModel.listStatements()) | ||
| } | ||
|
|
||
// Feeds the bundled generalized N-Triples fixture through `rdf to-jelly` on stdin and checks
// that the Jelly output translates back to 4 statements.
"input stream to output stream, generalized RDF (N-Triples)" in {
  // Open the fixture as a classpath stream instead of converting the resource URL to a file
  // path (URL paths are percent-encoded), and let Using close the stream when the test ends.
  scala.util.Using.resource(getClass.getResourceAsStream("/generalized.nt")) { inputStream =>
    RdfToJelly.setStdIn(inputStream)
    val (out, err) = RdfToJelly.runTestCommand(
      List("rdf", "to-jelly", "--in-format=nt"),
    )
    val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes)
    val content = translateJellyBack(newIn)
    content.size() should be(4)
  }
}
|
|
||
| "input stream to output stream, generalized RDF (N-Quads)" in { | ||
| val inputStream = new FileInputStream(getClass.getResource("/generalized.nq").getPath) | ||
| RdfToJelly.setStdIn(inputStream) | ||
| val (out, err) = RdfToJelly.runTestCommand( | ||
| List("rdf", "to-jelly", "--in-format=nq"), | ||
| ) | ||
| val newIn = new ByteArrayInputStream(RdfToJelly.getOutBytes) | ||
| val ds = DatasetGraphFactory.create() | ||
| RDFParser.source(newIn).lang(JellyLanguage.JELLY).parse(ds) | ||
| ds.size() should be(4) // 4 named graphs | ||
| ds.getDefaultGraph.size() should be(4) // 4 triples in the default graph | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test seems kind of roundabout to me. Why do we have a separate test .nq file only to check the number of quads at the end? Wouldn't it make more sense to create an .nq file dynamically from a set of statements and then compare the final graph to the original set of statements, or to translate the above graph to N-Quads and compare the two .nq files/strings?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The point is to just check if the parser works. Generating this dynamically is a pain, I'd prefer to have it in a file. |
||
| } | ||
|
|
||
| "an input stream to file" in withEmptyJellyFile { j => | ||
| val input = DataGenHelper.generateJenaInputStream(testCardinality) | ||
| RdfToJelly.setStdIn(input) | ||
|
|
@@ -211,6 +236,7 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: | |
| } | ||
| } | ||
| } | ||
|
|
||
| "handle conversion of other formats to Jelly" when { | ||
| "NTriples" in { | ||
| val input = DataGenHelper.generateJenaInputStream(testCardinality, RDFLanguages.NTRIPLES) | ||
|
|
@@ -385,6 +411,7 @@ class RdfToJellySpec extends AnyWordSpec with TestFixtureHelper with Matchers: | |
| } | ||
| } | ||
| } | ||
|
|
||
| "throw proper exception" when { | ||
| "invalid format is specified" in withFullJenaFile { f => | ||
| val e = | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why use value names like x or t? They are not very informative.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Those are the original names from Jena... let's go with that for now.