diff --git a/src/main/java/gr/forth/ics/isl/x3ml/engine/GeneratorContext.java b/src/main/java/gr/forth/ics/isl/x3ml/engine/GeneratorContext.java index cc9130e..629536f 100644 --- a/src/main/java/gr/forth/ics/isl/x3ml/engine/GeneratorContext.java +++ b/src/main/java/gr/forth/ics/isl/x3ml/engine/GeneratorContext.java @@ -19,6 +19,7 @@ Licensed to the Apache Software Foundation (ASF) under one package gr.forth.ics.isl.x3ml.engine; import gr.forth.ics.isl.x3ml.X3MLEngine; +import gr.forth.ics.isl.x3ml.engine.X3ML.GeneratedType; import org.w3c.dom.Node; import static gr.forth.ics.isl.x3ml.engine.X3ML.ArgValue; import static gr.forth.ics.isl.x3ml.engine.X3ML.Condition; @@ -32,8 +33,11 @@ Licensed to the Apache Software Foundation (ASF) under one import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayDeque; +import java.util.ArrayList; import java.util.Deque; import java.util.HashMap; +import java.util.List; +import java.util.regex.Pattern; import org.w3c.dom.Attr; import static gr.forth.ics.isl.x3ml.X3MLEngine.exception; @@ -170,9 +174,7 @@ public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMul }); put(variable_deprecated,VariableScope.WITHIN_MAPPING, generatedValue); context.putGeneratedValue(extractXPath(node) + unique+"-"+variable, generatedValue); - if(X3MLEngine.ENABLE_ASSOCIATION_TABLE){ - this.createAssociationTable(generatedValue, null, extractAssocTableXPath(node)); - } + this.createAssociationTable(generatedValue, generator, node); } }else{ // String nodeName = extractXPath(node) + unique+"-"+typeAwareVar; @@ -197,21 +199,9 @@ public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMul } } }); - GeneratedValue genArg=null; - if(generator.getName().equalsIgnoreCase("Literal")){ - genArg = context.policy().generate(generator, new Generator.ArgValues() { - @Override - public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMultipleValues) { - return context.input().evaluateArgument2(node, index, generator, name, sourceType); - - } - }); - } log.debug("put generated value: {}\t{}", nodeName, generatedValue); context.putGeneratedValue(nodeName, generatedValue); - if(X3MLEngine.ENABLE_ASSOCIATION_TABLE){ - this.createAssociationTable(generatedValue, genArg, extractAssocTableXPath(node)); - } + this.createAssociationTable(generatedValue, generator, node); } } } @@ -254,20 +244,8 @@ public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMul } } }); - GeneratedValue genArg=null; - if(generator.getName().equalsIgnoreCase("Literal")){ - genArg = context.policy().generate(generator, new Generator.ArgValues() { - @Override - public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMultipleValues) { - return context.input().evaluateArgument2(node, index, generator, name, sourceType); - - } - }); - } context.putGeneratedValue(nodeName, generatedValue); - if(X3MLEngine.ENABLE_ASSOCIATION_TABLE){ - this.createAssociationTable(generatedValue, genArg, extractAssocTableXPath(node)); - } + this.createAssociationTable(generatedValue, generator, node); } } } @@ -315,16 +293,6 @@ public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMul } } }); - GeneratedValue genArg=null; - if(generator.getName().equalsIgnoreCase("Literal")){ - genArg = context.policy().generate(generator, new Generator.ArgValues() { - @Override - public ArgValue getArgValue(String name, SourceType sourceType, boolean mergeMultipleValues) { - return context.input().evaluateArgument2(node, index, generator, name, sourceType); - - } - }); - } } context.putGeneratedValue(nodeName, generatedValue); @@ -338,21 +306,95 @@ public boolean conditionFails(Condition condition, GeneratorContext context) { return condition != null && condition.failure(context); } - private void createAssociationTable(GeneratedValue generatedValue, GeneratedValue generatedArg, String xpathProper){ - String value=""; - if(generatedValue.type == X3ML.GeneratedType.LITERAL){ - value="\""+generatedValue.text+"\""; + private void createAssociationTable(GeneratedValue generatedValue, GeneratorElement generator, Node node){ + if(X3MLEngine.ENABLE_ASSOCIATION_TABLE) { + String xpathProper = extractAssocTableXPath(node); + + String value=""; + if(generatedValue.type == GeneratedType.LITERAL || generatedValue.type == GeneratedType.TYPED_LITERAL) { + // we assume that there is argument named text for generators that generate Literal or Typed Literals + // and that this argument is of type xpath + String generatedArg = + generator.getArgs() + .stream() + .filter(arg -> SourceType.xpath.name().equals(arg.type)) + .findFirst() + .map(arg -> this.rewriteArgXPath(arg.value)) + .orElse(null); + + value="\""+generatedValue.text+"\""; + if(generatedArg != null) + xpathProper+="/"+generatedArg; + else + xpathProper+="/text()"; + } + else if(generatedValue.type == X3ML.GeneratedType.URI) { + value=generatedValue.text; + } - if(generatedArg!=null) - xpathProper+="/"+generatedArg.text; - else - xpathProper+="/text()"; - } - else if(generatedValue.type == X3ML.GeneratedType.URI) - value=generatedValue.text; if(xpathProper!=null){ //Needs a little more inspection this AssociationTable.addEntry(xpathProper,value); } + } + } + + private final Pattern NUMERIC_INDEX_PATTERN = Pattern.compile(".*\\[\\d+\\]$"); + private final Pattern FUNCTION_PATTERN = Pattern.compile(".*\\(.*\\)$"); + + /** + * In case of multiple intermediary elements we re-write xpath to always point to the first one + * because this is a default behaviour of non merging generators + */ + public String rewriteArgXPath(String xpath) { + // because we need to add [1] to every tag without index, + // but at the same time don't messup with function calls we are spliting xpath on "/" + // but doing this only if "/" is not inside function call or attribtue acces + List segments = new ArrayList<>(); + int lastSegmentStart = 0; + int bracketDepth = 0; + int parenthesisDepth = 0; + + for (int i = 0; i < xpath.length(); i++) { + char ch = xpath.charAt(i); + if (ch == '[') { + bracketDepth++; + } else if (ch == ']') { + bracketDepth--; + } else if (ch == '(') { + parenthesisDepth++; + } else if (ch == ')') { + parenthesisDepth--; + } else if (ch == '/' && bracketDepth == 0 && parenthesisDepth == 0 && i != 0) { + // we are not inside function call or attribute access + + // Check for double slash + if (i + 1 < xpath.length() && xpath.charAt(i + 1) == '/') { + i++; // Skip the next slash + } + + // If i is not 0, add the substring excluding the slash + if (i != 0) { + segments.add(xpath.substring(lastSegmentStart, i)); + } + lastSegmentStart = i + 1; // Move past the slash for the start of the next segment + } + } + + segments.add(xpath.substring(lastSegmentStart)); // Add the last segment + + for (int i = 0; i < segments.size(); i++) { + String segment = segments.get(i); + // Check if segment is not a function call, not a relative path, + // and does not already contain indexed access + if (!segment.isEmpty() && !segment.equals(".") && !segment.equals("..") + && !NUMERIC_INDEX_PATTERN.matcher(segment).matches() + && !FUNCTION_PATTERN.matcher(segment).matches()) { + segments.set(i, segment + "[1]"); + } + } + + // re-construct xpath + return String.join("/", segments); } /**Adds a new entry in the association table with the given XPATH expression and diff --git a/src/main/java/gr/forth/ics/isl/x3ml/engine/XPathInput.java b/src/main/java/gr/forth/ics/isl/x3ml/engine/XPathInput.java index a046d5c..cf18be3 100644 --- a/src/main/java/gr/forth/ics/isl/x3ml/engine/XPathInput.java +++ b/src/main/java/gr/forth/ics/isl/x3ml/engine/XPathInput.java @@ -151,58 +151,6 @@ public X3ML.ArgValue evaluateArgument(Node node, int index, GeneratorElement gen } return value; } - - public X3ML.ArgValue evaluateArgument2(Node node, int index, GeneratorElement generatorElement, String argName, SourceType defaultType) { - X3ML.GeneratorArg foundArg = null; - SourceType type = defaultType; - if (generatorElement.getArgs() != null) { - for (X3ML.GeneratorArg arg : generatorElement.getArgs()) { - if (arg.name == null) { - arg.name = "text"; - } - if (arg.name.equals(argName)) { - foundArg = arg; - type = sourceType(arg.type, defaultType); - } - } - - } - X3ML.ArgValue value = null; - switch (type) { - - case xpath: - if (foundArg == null) { - return null; - } - String lang = getLanguageFromSource(node); - if (lang == null) { - lang = languageFromMapping; - } - if (!foundArg.value.isEmpty()) { - value = argVal( foundArg.value.replaceAll("/", "[1]/"), lang); - if (value.string.isEmpty()) { - throw exception("Empty result for arg " + foundArg.name + " at node " + node.getNodeName() + " in generator\n" + generatorElement); - } - } - break; - case constant: - if (foundArg == null) { - return null; - } - value = argVal(foundArg.value, languageFromMapping); - break; - case position: - value = argVal(String.valueOf(index), null); - break; - case entireInput: - value=argVal(this.getEntireXpathInput(), languageFromMapping); - entireInputExportedRefUri=domainURIForNamedgraps; - break; - default: - throw new RuntimeException("Not implemented"); - } - return value; - } /** Returns the value that can be found in the corresponding node, after the evaluation * of the given XPath expression. More specifically it returns the results after diff --git a/src/test/java/eu/delving/x3ml/TestAssociationTable.java b/src/test/java/eu/delving/x3ml/TestAssociationTable.java new file mode 100644 index 0000000..89a64ad --- /dev/null +++ b/src/test/java/eu/delving/x3ml/TestAssociationTable.java @@ -0,0 +1,73 @@ +/*============================================================================== +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +==============================================================================*/ +package eu.delving.x3ml; + +import static eu.delving.x3ml.AllTests.document; +import static eu.delving.x3ml.AllTests.engine; +import static eu.delving.x3ml.AllTests.policy; +import static eu.delving.x3ml.AllTests.resource; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.IOUtils; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import gr.forth.ics.isl.x3ml.X3MLEngine; +import gr.forth.ics.isl.x3ml.engine.GeneratorContext; +import gr.forth.ics.isl.x3ml_reverse_utils.AssociationTable; + +public class TestAssociationTable { + + @BeforeClass + public static void setUp() { + X3MLEngine.ENABLE_ASSOCIATION_TABLE = true; + } + + @AfterClass + public static void tearDown() { + // because this flag is a static variable we need to make sure that we reset it + // after we are done with association table tests + X3MLEngine.ENABLE_ASSOCIATION_TABLE = false; + } + + @Before + public void before() { + AssociationTable.clearAssociationTable(); + } + + @Test + public void testCustomLiteralGenerator() throws IOException { + // test to check that proper xpath is generated not only for default Literal + // generator but also for a custom one like DateNormalizer + X3MLEngine engine = engine("/association_table/01_date-mappings.x3ml"); + X3MLEngine.Output output = engine.execute(document("/association_table/01_date-input.xml"), + policy("/association_table/01_date-generator-policy.xml")); + output.close(); + + String expected = IOUtils.toString( + resource("/association_table/01_date-expected-association-table.xml"), + StandardCharsets.UTF_8); + assertEquals(expected, GeneratorContext.exportAssociationTableToString()); + } +} diff --git a/src/test/resources/association_table/01_date-expected-association-table.xml b/src/test/resources/association_table/01_date-expected-association-table.xml new file mode 100644 index 0000000..01b3c4f --- /dev/null +++ b/src/test/resources/association_table/01_date-expected-association-table.xml @@ -0,0 +1,62 @@ + + + /record[1] + http://vocab.getty.edu/aat/300133025 + + + /record[1] + https://artresearch.net/resource/frick/work/991013309269707141 + + + /record[1]/controlfield[2] + https://artresearch.net/resource/frick/work/991013309269707141/production + + + /record[1]/controlfield[2] + https://artresearch.net/resource/frick/work/991013309269707141/production/timespan + + + /record[1]/controlfield[2]/substring(substring-after(text(), 'k'), 1, 4) + "1527-12-31T23:59:59" + + + /record[1]/controlfield[2]/substring(substring-after(text(), 'k'), 5, 4) + "1527-01-01T00:00:00" + + + /record[1]/datafield[2]/subfield[1] + https://artresearch.net/resource/frick/work/991013309269707141/production-timespan-appellation/F38A46F2-D8E8-3427-8010-70AA0BAB25FC + + + /record[1]/datafield[2]/subfield[1]/text() + "1527." + + + /record[1]/datafield[3]/subfield[5] + https://artresearch.net/resource/frick/type/5B02D2E1-391B-3BF8-9813-758D731C7183 + + + /record[1]/datafield[3]/subfield[5] + https://artresearch.net/resource/frick/work/991013309269707141/acquisition/94D0DF61-CB3E-3D02-B669-4A6F2A4F189F + + + /record[1]/datafield[3]/subfield[5]/string-join((../subfield[@code="a"]/text(),"-"),' ') + "Location: Frick Collection, - Acquisition: Henry Clay Frick Bequest, - -" + + + /record[1]/datafield[4]/subfield[1] + https://artresearch.net/resource/frick/type/8E0AC9AA-3D9D-357A-984E-9B1E52105392 + + + /record[1]/datafield[4]/subfield[1] + https://artresearch.net/resource/frick/work/991013309269707141/subject/54F23224-5B7B-3276-8C6D-16F231A0A4BB + + + /record[1]/datafield[4]/subfield[1]/../../datafield[@ind2="7" and @tag="650"][1]/subfield[@code="a"][1]/text() + "Portraits: Men: With hands: With hats: Head to right." + + + /record[1]/datafield[@tag="245"][1]/subfield[@code="a"][1]/text() + "Sir Thomas More." + + \ No newline at end of file diff --git a/src/test/resources/association_table/01_date-generator-policy.xml b/src/test/resources/association_table/01_date-generator-policy.xml new file mode 100644 index 0000000..e58274b --- /dev/null +++ b/src/test/resources/association_table/01_date-generator-policy.xml @@ -0,0 +1,39 @@ + + + + {subject_type}/{subject_id}/{identifier_type}/{identifier_id} + + + {type}/{id} + + + + + + + + {type}/{id} + + + {resourceType}/{resourceId}/{eventType} + + + {resourceType}/{resourceId}/{eventType}/timespan + + + + + + + + + + {resourceType}/{resourceId}/subject/{subject_id} + + + {resourceType}/{resourceId}/{eventType}/{from}{to}{timespan} + + + Location: {To} Acquisition: {Type} {Date} + + \ No newline at end of file diff --git a/src/test/resources/association_table/01_date-input.xml b/src/test/resources/association_table/01_date-input.xml new file mode 100644 index 0000000..e7c6c67 --- /dev/null +++ b/src/test/resources/association_table/01_date-input.xml @@ -0,0 +1,30 @@ + + + 13826nkc a2202221 a 4500 + 991013309269707141 + 110428k15271527xx nnn | | cneng|d + + + Sir Thomas More. + + + 1527. + + + Frick Collection, + New York, + New York, + United States, + Henry Clay Frick Bequest, + 1912.1.77, + public. + + + Art, German. + http://id.loc.gov/authorities/subjects/sh85007675 + + + Portraits: Men: With hands: With hats: Head to right. + local + + \ No newline at end of file diff --git a/src/test/resources/association_table/01_date-mappings.x3ml b/src/test/resources/association_table/01_date-mappings.x3ml new file mode 100644 index 0000000..773d6fb --- /dev/null +++ b/src/test/resources/association_table/01_date-mappings.x3ml @@ -0,0 +1,227 @@ + + + + + + + + + + + + + + + /record + + + crm:E22_Human-Made_Object + + work + controlfield[@tag="001"] + + + datafield[@tag="245"]/subfield[@code="a"]/text() + + + crm:P2_has_type + + crm:E55_Type + + http://vocab.getty.edu/aat/300133025 + + + + + + + + + + controlfield[@tag="008"] + + + crm:P108i_was_produced_by + + crm:E12_Production + + work + /record/controlfield[@tag="001"] + production + + + crm:P4_has_time-span + + crm:E52_Time-Span + + work + /record/controlfield[@tag="001"] + production + + + crm:P82a_begin_of_the_begin + + + + controlfield[@tag="008"] + + + xsd:dateTime + + Upper + Date_and_Time + substring(substring-after(text(), 'k'), 1, 4) + + + + + + + + + controlfield[@tag="008"] + + + crm:P108i_was_produced_by + + crm:E12_Production + + crm:P4_has_time-span + + crm:E52_Time-Span + + crm:P82b_end_of_the_end + + + + controlfield[@tag="008"] + + + xsd:dateTime + + Lower + Date_and_Time + substring(substring-after(text(), 'k'), 5, 4) + + + + + + + + + datafield[@tag="260"]/subfield[@code="c"] + + + crm:P108i_was_produced_by + + crm:E12_Production + + crm:P4_has_time-span + + crm:E52_Time-Span + + crm:P1_is_identified_by + + crm:E41_Appellation + + work + /record/controlfield[@tag="001"] + production-timespan-appellation + text() + + + crm:P190_has_symbolic_content + + + + datafield[@tag="260"]/subfield[@code="c"] + + + rdfs:Literal + + text() + + + + + + + + + datafield[@ind2="0" and @tag="650"]/subfield[@code="a"] + + + crm:P65_shows_visual_item + + crm:E36_Visual_Item + + work + /record/controlfield[@tag="001"] + ../../datafield[@ind2="7" and @tag="650"]/subfield[@code="a"]/text() + + + ../../datafield[@ind2="7" and @tag="650"]/subfield[@code="a"]/text() + + + crm:P2_has_type + + + + datafield[@ind2="0" and @tag="650"]/subfield[@code="a"] + + + custom:noType + + type + text() + + + + + + + + + datafield[@tag="590" and @ind1="1"]/subfield[@code="e"] + + + crm:P24i_changed_ownership_through + + crm:E8_Acquisition + + work + /record/controlfield[@tag="001"] + acquisition + - + ../subfield[@code="a"]/text() + - + + + string-join((../subfield[@code="a"]/text(),"-"),' ') + string-join((text(),"-"),' ') + string-join((../subfield[@code="l"]/text(),"-"),' ') + + + crm:P2_has_type + + + + datafield[@tag="590" and @ind1="1"]/subfield[@code="e"] + + + custom:noType + + type + text() + + + + + + + +