Merge branch 'wip-0.2.2' of github.com:rdelbru/SIREn into wip-0.2.2

laurensdv · Nov 11, 2011 · cb8cbff · cb8cbff
2 parents 8d3bb27 + 864c069
commit cb8cbff
Show file tree

Hide file tree

Showing 36 changed files with 562 additions and 176 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,5 +1,5 @@
 
-======================= Release 0.2.2 2011-xx-xx =======================
+======================= Release 0.2.2-RC1 2011-11-07 =======================
 
 New Features
 
@@ -12,10 +12,16 @@ Improvements
 
 * [GH-22] Upgrade to Lucene/Solr 3.4
 
+Bugs
+
+* [SRN-116] Question mark not correctly escaped in keyword search
+* [SRN-117] Phrase Query problem with uri local name
+
 ======================= Release 0.2.1 2011-09-13 =======================
 
 Bugs
 
+* [GH-15] siren-core - Fixed POM issue caused by the caliper dependency on gson
 * [SRN-25] Counting of matchers in SirenDisjunctionScorer is wrong
 * [SRN-79] Make URINormalisationFilter less aggressive: token length minimum
            limit is now set to 3.
@@ -46,7 +52,7 @@ Improvements
 * [SRN-100] Improve user-readable version of the query
 * [SRN-102] Catch SolrException in SIREnQParser#parse
 * [SRN-105] Upgrade to Lucene/Solr 3.1
-* []SRN-108] Improved support of encoded character in URI
+* [SRN-108] Improved support of encoded character in URI
 * [SRN-112] Tokenisation problem with URI localnames
 
 * [GH-1] Pom fixes

diff --git a/pom.xml b/pom.xml
@@ -1,12 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 
   <modelVersion>4.0.0</modelVersion>
 
   <groupId>org.sindice.siren</groupId>
   <artifactId>siren-aggregator</artifactId>
-  <version>1-SNAPSHOT</version>
+  <version>0.2.2-RC2-SNAPSHOT</version>
   <packaging>pom</packaging>
   <name>siren-aggregator</name>
 
@@ -17,4 +16,34 @@
     <module>siren-qparser</module>
   </modules>
 
+  <scm>
+    <connection>scm:git:ssh://[email protected]/rdelbru/SIREn.git</connection>
+    <developerConnection>scm:git:ssh://[email protected]/rdelbru/SIREn.git</developerConnection>
+    <url>git:ssh://[email protected]/rdelbru/SIREn.git</url>
+  </scm>
+
+  <!-- To avoid an error when tagging -->
+  <build>
+    <pluginManagement>
+      <plugins>
+       <plugin>
+         <groupId>org.apache.maven.plugins</groupId>
+         <artifactId>maven-release-plugin</artifactId>
+         <version>2.2.1</version>
+       </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+
+  <!-- distributionManagement>
+    <repository>
+      <id>repo</id>
+      <url>https://github.com/rdelbru/maven-repository/raw/master/releases</url>
+    </repository>
+    <snapshotRepository>
+      <id>snapshot-repo</id>
+      <url>https://github.com/rdelbru/maven-repository/raw/master/snapshots</url>
+    </snapshotRepository>
+  </distributionManagement -->
+
 </project>
diff --git a/siren-core/pom.xml b/siren-core/pom.xml
@@ -1,18 +1,17 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
-  xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
 
   <parent>
     <artifactId>siren-parent</artifactId>
     <groupId>org.sindice.siren</groupId>
-    <version>0.2.2-SNAPSHOT</version>
+    <version>0.2.2-RC2-SNAPSHOT</version>
     <relativePath>../siren-parent</relativePath>
   </parent>
 
   <groupId>org.sindice.siren</groupId>
   <artifactId>siren-core</artifactId>
-  <version>0.2.2-SNAPSHOT</version>
+  <version>0.2.2-RC2-SNAPSHOT</version>
   <name>siren-core</name>
   <url>http://siren.sindice.org</url>
 
@@ -89,16 +88,17 @@
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-test-framework</artifactId>
       <version>${lucene.version}</version>
-			<scope>test</scope>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>com.google.code.caliper</groupId>
+      <artifactId>caliper</artifactId>
+      <version>1.0-SNAPSHOT</version>
+      <scope>test</scope>
     </dependency>
-
-		<dependency>
-		  <groupId>com.google.code.caliper</groupId>
-		  <artifactId>caliper</artifactId>
-		  <version>1.0-SNAPSHOT</version>
-			<scope>test</scope>
-		</dependency>
 
+    <!-- Because of a dependency issue in caliper -->
     <dependency>
       <groupId>com.google.code.gson</groupId>
       <artifactId>gson</artifactId>

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/AnyURIAnalyzer.java b/siren-core/src/main/java/org/sindice/siren/analysis/AnyURIAnalyzer.java
@@ -47,7 +47,19 @@
 import org.sindice.siren.analysis.filter.URITrailingSlashFilter;
 
 /**
- * Analyzer designed to deal with any kind of URIs.
+ * Analyzer designed to deal with any kind of URIs and perform some post-processing
+ * on URIs.
+ * <br>
+ * The URI normalisation can be configured. You can disable it, activate it
+ * only on URI local name, or on the full URI. However, URI normalisation on the
+ * full URI is costly in terms of CPU at indexing time, and can double the size
+ * of the index, since each URI is duplicated by n tokens.
+ * <br>
+ * By default, the URI normalisation is disabled.
+ * <br>
+ * When full uri normalisation is activated, the analyzer is much slower than
+ * the WhitespaceTupleAnalyzer. If you are not indexing RDF data, consider
+ * using the WhitespaceTupleAnalyzer instead.
  */
 public class AnyURIAnalyzer extends Analyzer {
 

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/NumericAnalyzer.java b/siren-core/src/main/java/org/sindice/siren/analysis/NumericAnalyzer.java
@@ -69,12 +69,6 @@ public final TokenStream reusableTokenStream(final String fieldName, final Reade
       streams.tokenStream = new SirenNumericTokenStream(precisionStep);
       this.setNumericValue(streams.tokenStream, reader);
     } else {
-//      streams.tokenStream.reset();
-      /*
-       * Calling reset would throw an exception if a previous call to this stream
-       * failed somehow.
-       * Anyway, the reset just reset values which are assigned later by the set***Value method
-       */
       this.setNumericValue(streams.tokenStream, reader);
     }
     return streams.tokenStream;

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/TupleAnalyzer.java b/siren-core/src/main/java/org/sindice/siren/analysis/TupleAnalyzer.java
@@ -34,24 +34,13 @@
 import org.apache.lucene.analysis.CharArrayMap;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
+import org.sindice.siren.analysis.filter.DatatypeAnalyzerFilter;
 import org.sindice.siren.analysis.filter.SirenDeltaPayloadFilter;
 import org.sindice.siren.analysis.filter.TokenTypeFilter;
-import org.sindice.siren.analysis.filter.DatatypeAnalyzerFilter;
 
 /**
  * The TupleAnalyzer is especially designed to process RDF data. It applies
  * various post-processing on URIs and Literals.
- * <br>
- * The URI normalisation can be configured. You can disable it, activate it
- * only on URI local name, or on the full URI. However, URI normalisation on the
- * full URI is costly in term of CPU at indexing time, and can double the size
- * of the index, since each URI is duplicated by n tokens.
- * <br>
- * By default, the URI normalisation is disabled.
- * <br>
- * When full uri normalisation is activated, the analyzer is much slower than
- * the WhitespaceTupleAnalyzer. If you are not indexing RDF data, consider to
- * use the WhitespaceTupleAnalyzer instead.
  */
 public class TupleAnalyzer extends Analyzer {
 
@@ -62,6 +51,12 @@ public class TupleAnalyzer extends Analyzer {
 
   private final CharArrayMap<Analyzer> regLitAnalyzers;
 
+  /**
+   * Create a {@link TupleAnalyzer} with the default {@link Analyzer} for Literals and URIs.
+   * @param version
+   * @param stringAnalyzer default Literal {@link Analyzer}
+   * @param anyURIAnalyzer default URI {@link Analyzer}
+   */
   public TupleAnalyzer(Version version, final Analyzer stringAnalyzer, final Analyzer anyURIAnalyzer) {
     matchVersion = version;
     this.stringAnalyzer = stringAnalyzer;
@@ -78,12 +73,21 @@ public void setAnyURIAnalyzer(final Analyzer analyzer) {
     anyURIAnalyzer = analyzer;
   }
 
+  /**
+   * Assign an {@link Analyzer} to be used with that key. That analyzer is used
+   * to process tokens output by the {@link TupleTokenizer}.
+   * @param datatype
+   * @param a
+   */
   public void registerLiteralAnalyzer(char[] datatype, Analyzer a) {
     if (!regLitAnalyzers.containsKey(datatype)) {
       regLitAnalyzers.put(datatype, a);
     }
   }
 
+  /**
+   * Remove all registered {@link Analyzer}s.
+   */
   public void clearRegisterLiteralAnalyzers() {
     regLitAnalyzers.clear();
   }

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/TupleTokenizer.java b/siren-core/src/main/java/org/sindice/siren/analysis/TupleTokenizer.java
@@ -40,7 +40,7 @@
 
 /**
  * A grammar-based tokenizer constructed with JFlex for N-Tuples. Splits a
- * N-Tuple into BNode, URI, Literal, Dot tokens.
+ * N-Tuple into BNode, URI, Literal and Dot tokens.
  */
 public class TupleTokenizer
 extends Tokenizer {

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/WhitespaceAnyURIAnalyzer.java b/siren-core/src/main/java/org/sindice/siren/analysis/WhitespaceAnyURIAnalyzer.java
@@ -41,7 +41,8 @@
 import org.sindice.siren.analysis.filter.AssignTokenTypeFilter;
 
 /**
- * Analyzer designed to deal with any kind of URIs.
+ * Analyzer designed to deal with any kind of URIs. It does not perform any
+ * post-processing on URIs. Only the {@link LowerCaseFilter} is used.
  */
 public class WhitespaceAnyURIAnalyzer extends Analyzer {
 

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/filter/DatatypeAnalyzerFilter.java b/siren-core/src/main/java/org/sindice/siren/analysis/filter/DatatypeAnalyzerFilter.java
@@ -60,6 +60,7 @@ public class DatatypeAnalyzerFilter extends TokenFilter {
 
   private final CharArrayMap<Analyzer> dtsAnalyzer;
 
+  //
   private CharTermAttribute termAtt;
   private OffsetAttribute offsetAtt;
   private PositionIncrementAttribute posIncrAtt;
@@ -85,6 +86,14 @@ public DatatypeAnalyzerFilter(final Version version,
     this.initAttributes();
   }
 
+  /**
+   * Create a {@link DatatypeAnalyzerFilter} with the given default {@link Analyzer}
+   * for URI and Literal.
+   * @param version The Lucene version to use
+   * @param input the input token stream
+   * @param stringAnalyzer the default Literal {@link Analyzer}
+   * @param anyURIAnalyzer the default URI {@link Analyzer}
+   */
   public DatatypeAnalyzerFilter(final Version version,
                                 final TokenStream input,
                                 final Analyzer stringAnalyzer,

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/filter/URILocalnameFilter.java b/siren-core/src/main/java/org/sindice/siren/analysis/filter/URILocalnameFilter.java
@@ -87,6 +87,7 @@ public final boolean incrementToken() throws java.io.IOException {
 
     // While we are normalising the URI
     if (_isNormalising) {
+      this.posIncrAtt.setPositionIncrement(1); // reset the position increment
       this.nextToken();
       return true;
     }

diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/filter/URINormalisationFilter.java b/siren-core/src/main/java/org/sindice/siren/analysis/filter/URINormalisationFilter.java
@@ -71,6 +71,7 @@ public final boolean incrementToken() throws java.io.IOException {
 
     // While we are normalising the URI
     if (_isNormalising) {
+      this.posIncrAtt.setPositionIncrement(1); // reset the position increment
       this.nextToken();
       return true;
     }

diff --git a/siren-core/src/main/java/org/sindice/siren/search/SirenTopTermsRewrite.java b/siren-core/src/main/java/org/sindice/siren/search/SirenTopTermsRewrite.java
@@ -31,9 +31,10 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopTermsRewrite;
 
 /**
- * 
+ * Code taken from {@link TopTermsRewrite} in order to use {@link SirenMultiTermQuery}.
  */
 public abstract class SirenTopTermsRewrite<Q extends Query> extends SirenTermCollectingRewrite<Q> {
 

diff --git a/siren-core/src/test/java/org/sindice/siren/analysis/TestTupleAnalyzer.java b/siren-core/src/test/java/org/sindice/siren/analysis/TestTupleAnalyzer.java
@@ -47,6 +47,8 @@
 import org.sindice.siren.analysis.AnyURIAnalyzer.URINormalisation;
 import org.sindice.siren.analysis.attributes.CellAttribute;
 import org.sindice.siren.analysis.attributes.TupleAttribute;
+import org.sindice.siren.analysis.filter.URILocalnameFilter;
+import org.sindice.siren.analysis.filter.URINormalisationFilter;
 
 public class TestTupleAnalyzer {
 
@@ -82,7 +84,7 @@ public void assertAnalyzesTo(final Analyzer a, final String input,
                                 final String[] expectedTypes,
                                 final int[] expectedPosIncrs)
   throws Exception {
-    this.assertAnalyzesTo(a, input, expectedImages, expectedTypes, null, null,
+    this.assertAnalyzesTo(a, input, expectedImages, expectedTypes, expectedPosIncrs, null,
       null);
   }
 
@@ -150,6 +152,46 @@ public void assertAnalyzesTo(final Analyzer a, final String input,
     t.close();
   }
 
+  /**
+   * Test the local URINormalisation: the word "the" is a stop word, hence it is
+   * filtered. The position increment is updated accordingly, but it is not reset for
+   * future calls. Corrects issue SRN-117.
+   * @throws Exception
+   */
+  @Test
+  public void testURINormalisation()
+  throws Exception {
+    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(Version.LUCENE_34);
+    uriAnalyzer.setUriNormalisation(URINormalisation.LOCALNAME);
+    _a = new TupleAnalyzer(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31), uriAnalyzer);
+
+    this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/The_Kingston_Trio>",
+                          new String[] { "kingston", "trio", "the_kingston_trio",
+                                         "http://dbpedia.org/resource/the_kingston_trio" },
+                          new String[] { "<URI>", "<URI>", "<URI>", "<URI>" },
+                          new int[] { 2, 1, 0, 0 });
+  }
+
+  /**
+   * The same, with Full normalisation -- the stop word is now "their" because
+   * {@link URINormalisationFilter} internally filters out words smaller
+   * than 4 (it was 3 for {@link URILocalnameFilter}).
+   * @throws Exception
+   */
+  @Test
+  public void testURINormalisation2()
+  throws Exception {
+    final AnyURIAnalyzer uriAnalyzer = new AnyURIAnalyzer(Version.LUCENE_34);
+    uriAnalyzer.setUriNormalisation(URINormalisation.FULL);
+    _a = new TupleAnalyzer(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31), uriAnalyzer);
+
+    this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/their_Kingston_Trio>",
+                          new String[] { "dbpedia", "resource", "kingston", "trio",
+                                         "http://dbpedia.org/resource/their_kingston_trio" },
+                          new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" },
+                          new int[] { 1, 1, 2, 1, 0 });
+  }
+
   @Test
   public void testURI()
   throws Exception {

diff --git a/siren-core/src/test/java/org/sindice/siren/analysis/filter/TestURILocalnameFilter.java b/siren-core/src/test/java/org/sindice/siren/analysis/filter/TestURILocalnameFilter.java
@@ -65,7 +65,7 @@ public void assertNormalisesTo(final Tokenizer t, final String input,
                                 final String[] expectedTypes,
                                 final int[] expectedPosIncrs)
   throws Exception {
-    this.assertNormalisesTo(t, input, expectedImages, expectedTypes, null, null,
+    this.assertNormalisesTo(t, input, expectedImages, expectedTypes, expectedPosIncrs, null,
       null);
   }
 
@@ -153,6 +153,21 @@ public void testURI()
       new String[] { "uppercase", "Should", "Tokenised", "uppercaseShouldBeTokenised", "http://renaud.delbru.fr/rdf/uppercaseShouldBeTokenised" });
     this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised>",
       new String[] { "AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised", "http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised" });
+
+
+    final String triple = "<http://dbpedia.org/resource/The_Kingston_Trio> " +
+                          "<http://purl.org/dc/terms/subject>  " +
+                          "<http://dbpedia.org/resource/Category:Decca_Records_artists>";
+    this.assertNormalisesTo(_t, triple,
+        new String[] { "The", "Kingston", "Trio", "The_Kingston_Trio", "http://dbpedia.org/resource/The_Kingston_Trio",
+                       "subject", "http://purl.org/dc/terms/subject",
+                       "Category", "Decca", "Records", "artists", "Category:Decca_Records_artists", "http://dbpedia.org/resource/Category:Decca_Records_artists" },
+        new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>",
+                       "<URI>", "<URI>",
+                       "<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" },
+        new int[] { 1, 1, 1, 0, 0,
+                    1, 0,
+                    1, 1, 1, 1, 0, 0 });
   }
 
   @Test