Fixed tilde query problem

jsteggink · May 26, 2011 · 996b945 · 996b945
1 parent 6699299
commit 996b945
Show file tree

Hide file tree

Showing 9 changed files with 127 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,4 @@ out/
 
 # Other
 bin/
+siren-solr/src/test/resources/solr.home/data/
diff --git a/siren-core/src/main/java/org/sindice/siren/analysis/filter/URITrailingSlashFilter.java b/siren-core/src/main/java/org/sindice/siren/analysis/filter/URITrailingSlashFilter.java
@@ -23,42 +23,67 @@
 package org.sindice.siren.analysis.filter;
 
 import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.sindice.siren.analysis.TupleTokenizer;
 
 /**
  * Filter that removes the trailing slash from tokens of type
- * {@code TupleTokenizer.URI}
+ * {@link TupleTokenizer#URI} or {@link UAX29URLEmailTokenizer#URL_TYPE}
  */
 public class URITrailingSlashFilter extends TokenFilter {
 
-  private final String tokenType;
+  private Set<String> tokenTypes = new HashSet<String>();
+  private final static Set<String> DEFAULT_TOKEN_TYPES = new HashSet<String>();
+
   private final CharTermAttribute termAtt;
   private final TypeAttribute typeAtt;
 
+  // by default, check the token type
+  public static final boolean DEFAULT_CHECKTYPE = true;
+  private boolean checkType = DEFAULT_CHECKTYPE;
+
+  static {
+    DEFAULT_TOKEN_TYPES.add(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
+    DEFAULT_TOKEN_TYPES.add(UAX29URLEmailTokenizer.URL_TYPE);
+  }
+
   public URITrailingSlashFilter(final TokenStream in) {
-    this(in, TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
+    this(in, new HashSet<String>(DEFAULT_TOKEN_TYPES));
   }
 
   public URITrailingSlashFilter(final TokenStream in, final String tokenType) {
     super(in);
-    this.tokenType = tokenType;
+    this.tokenTypes.add(tokenType);
     termAtt = this.addAttribute(CharTermAttribute.class);
     typeAtt = this.addAttribute(TypeAttribute.class);
   }
 
+  public URITrailingSlashFilter(final TokenStream in, final Set<String> tokenTypes) {
+    super(in);
+    this.tokenTypes = tokenTypes;
+    termAtt = this.addAttribute(CharTermAttribute.class);
+    typeAtt = this.addAttribute(TypeAttribute.class);
+  }
+
+  public void setCheckTokenType(final boolean checkType) {
+    this.checkType = checkType;
+  }
+
   @Override
   public final boolean incrementToken() throws IOException {
     if (!input.incrementToken()) {
       return false;
     }
 
     final String type = typeAtt.type();
-    if (type.equals(tokenType)) {
+    if (checkType ? tokenTypes.contains(type) : true) {
       final int bufferLength = termAtt.length();
       // Remove trailing slash
       if (termAtt.buffer()[bufferLength - 1] == '/') {

diff --git a/siren-qparser/src/main/java/org/sindice/siren/qparser/keyword/KeywordQParserImpl.java b/siren-qparser/src/main/java/org/sindice/siren/qparser/keyword/KeywordQParserImpl.java
@@ -70,11 +70,12 @@ public Query parse(final String query) throws ParseException {
     }
   }
 
+  // TODO: does not support mailto: uri
   static String uriRegExp = "(news|(ht|f)tp(s?))\\://[a-zA-Z0-9\\-\\.]+\\.[a-zA-Z]{2,3}(/\\S*)?";
   static Pattern pattern = Pattern.compile(uriRegExp);
 
   // TODO: check if other special characters of lucene can appear in a URI, and
-  // escape them
+  // escape them, for example ~.
   private String escapeURIs(final String query) {
     final Matcher matcher = pattern.matcher(query);
 
@@ -96,7 +97,7 @@ public static String escape(final String s) {
     for (int i = 0; i < s.length(); i++) {
       final char c = s.charAt(i);
       // These characters are part of the field query syntax and must be escaped
-      if (c == ':' || c == '\\') {
+      if (c == ':' || c == '\\' || c == '~') {
         sb.append('\\');
       }
       sb.append(c);

diff --git a/...ser/src/main/java/org/sindice/siren/qparser/keyword/config/KeywordQueryConfigHandler.java b/...ser/src/main/java/org/sindice/siren/qparser/keyword/config/KeywordQueryConfigHandler.java
@@ -53,7 +53,7 @@ public KeywordQueryConfigHandler() {
     this.addAttribute(MultiTermRewriteMethodAttribute.class);
     this.addAttribute(AllowFuzzyAndWildcardAttribute.class);
     this.addAttribute(AllowLeadingWildcardAttribute.class);
-    this.addAttribute(PositionIncrementsAttribute.class);
+    this.addAttribute(PositionIncrementsAttribute.class).setPositionIncrementsEnabled(true);
     this.addAttribute(LocaleAttribute.class);
     this.addAttribute(DefaultPhraseSlopAttribute.class);
     this.addAttribute(MultiTermRewriteMethodAttribute.class);

diff --git a/...n/java/org/sindice/siren/qparser/keyword/query/processors/AnalyzerQueryNodeProcessor.java b/...n/java/org/sindice/siren/qparser/keyword/query/processors/AnalyzerQueryNodeProcessor.java
@@ -49,7 +49,6 @@
 import org.apache.lucene.queryParser.standard.config.AnalyzerAttribute;
 import org.apache.lucene.queryParser.standard.config.PositionIncrementsAttribute;
 import org.apache.lucene.queryParser.standard.nodes.MultiPhraseQueryNode;
-import org.apache.lucene.queryParser.standard.nodes.StandardBooleanQueryNode;
 import org.apache.lucene.queryParser.standard.nodes.WildcardQueryNode;
 
 /**
@@ -63,11 +62,14 @@
  * If the analyzer returns only one term, the returned term is set to the
  * {@link FieldQueryNode} and it's returned. <br/>
  * <br/>
- * If the analyzer return more than one term, a {@link OrQueryNode},
+ * If the analyzer returns more than one term, a
  * {@link TokenizedPhraseQueryNode} or {@link MultiPhraseQueryNode} is created,
  * whether there is one or more
  * terms at the same position, and it's returned. <br/>
  * <br/>
+ * An {@link OrQueryNode} can be returned if query expansion is detected, i.e.,
+ * more than one term at the same position.
+ * <br/>
  * If no term is returned by the analyzer a {@link NoTokenFoundQueryNode} object
  * is returned. <br/>
  * <br/>
@@ -183,7 +185,8 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept
       if (numTokens == 0) {
         return new NoTokenFoundQueryNode();
 
-      } else if (numTokens == 1) {
+      }
+      else if (numTokens == 1) {
         String term = null;
         try {
           boolean hasNext;
@@ -199,13 +202,18 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept
 
         return fieldNode;
 
-      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
+      }
+      else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
         if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
           // no phrase query:
           final LinkedList<QueryNode> children = new LinkedList<QueryNode>();
 
+          int position = -1;
+
           for (int i = 0; i < numTokens; i++) {
             String term = null;
+            final int positionIncrement = 1;
+
             try {
               final boolean hasNext = buffer.incrementToken();
               assert hasNext == true;
@@ -215,21 +223,31 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept
               // safe to ignore, because we know the number of tokens
             }
 
-            children.add(new FieldQueryNode(field, term, -1, -1));
+            final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);
 
+            if (this.positionIncrementsEnabled) {
+              position += positionIncrement;
+              newFieldNode.setPositionIncrement(position);
+            } else {
+              newFieldNode.setPositionIncrement(i);
+            }
+
+            children.add(new FieldQueryNode(field, term, -1, -1));
           }
 
           // If multiple terms at one single position, this must be a query
           // expansion. Perform a OR between the terms.
           if (severalTokensAtSamePosition && positionCount == 1) {
             return new GroupQueryNode(new OrQueryNode(children));
           }
-          else if (positionCount == 1) {
-            return new GroupQueryNode(
-              new StandardBooleanQueryNode(children, true));
-          }
+          // if several tokens at same position && position count > 1, then
+          // results can be unexpected
           else {
-            return new StandardBooleanQueryNode(children, false);
+            final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
+            for (int i = 0; i < children.size(); i++) {
+              pq.add(children.get(i));
+            }
+            return pq;
           }
 
         }

diff --git a/siren-solr/src/main/java/org/sindice/siren/solr/analysis/URITrailingSlashFilterFactory.java b/siren-solr/src/main/java/org/sindice/siren/solr/analysis/URITrailingSlashFilterFactory.java
@@ -22,16 +22,32 @@
  */
 package org.sindice.siren.solr.analysis;
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.analysis.BaseTokenFilterFactory;
 import org.sindice.siren.analysis.filter.URITrailingSlashFilter;
 
 public class URITrailingSlashFilterFactory
 extends BaseTokenFilterFactory {
 
+  public static final String CHECKTYPE_KEY = "checkTokenType";
+
+  private boolean checkType = true;
+
+  @Override
+  public void init(final Map<String,String> args) {
+   super.init(args);
+   this.assureMatchVersion();
+   final String check = args.get(CHECKTYPE_KEY);
+   checkType = (check != null ? Boolean.parseBoolean(check) : URITrailingSlashFilter.DEFAULT_CHECKTYPE);
+  }
+
   @Override
   public TokenStream create(final TokenStream input) {
-    return new URITrailingSlashFilter(input);
+    final URITrailingSlashFilter filter = new URITrailingSlashFilter(input);
+    filter.setCheckTokenType(checkType);
+    return filter;
   }
 
 }
diff --git a/siren-solr/src/test/java/org/sindice/siren/solr/TestSirenQParserPlugin.java b/siren-solr/src/test/java/org/sindice/siren/solr/TestSirenQParserPlugin.java
@@ -263,4 +263,40 @@ public void testASCIIFoldingExpansion() throws IOException, SolrServerException
     Assert.assertTrue("id2 should get higher score than id1", score1 < score2);
   }
 
+  @Test
+  public void testMailto() throws IOException, SolrServerException {
+    this.addNTripleString("id1", "<http://s> <http://p> <mailto:[email protected]> .");
+    SolrQuery query = new SolrQuery();
+    query.setQuery("mailto:[email protected]");
+    query.setQueryType("siren");
+
+    String[] results = wrapper.search(query, "url");
+    assertEquals(1, results.length);
+
+    query = new SolrQuery();
+    query.setQuery("[email protected]");
+    query.setQueryType("siren");
+
+    results = wrapper.search(query, "url");
+    assertEquals(1, results.length);
+  }
+
+  @Test
+  public void testTildeInURI() throws IOException, SolrServerException {
+    this.addNTripleString("id1", "<http://s> <http://p> <http://sw.deri.org/~aidanh/> .");
+    SolrQuery query = new SolrQuery();
+    query.setQuery("http://sw.deri.org/~aidanh/");
+    query.setQueryType("siren");
+
+    String[] results = wrapper.search(query, "url");
+    assertEquals(1, results.length);
+
+    query = new SolrQuery();
+    query.setQuery("http://sw.deri.org/~aidanh");
+    query.setQueryType("siren");
+
+    results = wrapper.search(query, "url");
+    assertEquals(1, results.length);
+  }
+
 }
diff --git a/siren-solr/src/test/resources/solr.home/conf/ntriple-schema.xml b/siren-solr/src/test/resources/solr.home/conf/ntriple-schema.xml
@@ -44,6 +44,7 @@
     -->
     <fieldType name="ntriple-uri" class="org.apache.solr.schema.SubTextField">
       <analyzer type="query">
+      	<!-- whitespace tokenizer to not tokenize URI -->   
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 
         <!-- Remove trailing slash of URIs. -->       
@@ -130,7 +131,12 @@
     -->
     <fieldType name="ntriple-keyword" class="org.apache.solr.schema.SubTextField">
       <analyzer type="query">
+      	<!-- whitespace tokenizer to not tokenize URI -->
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+
+				<!-- Remove trailing slash of URIs. -->       
+        <filter class="org.sindice.siren.solr.analysis.URITrailingSlashFilterFactory"
+				        checkTokenType="false"/>
 
         <!-- Filters out those tokens *not* having length min through max 
              inclusive. -->

diff --git a/siren-solr/src/test/resources/solr.home/conf/schema.xml b/siren-solr/src/test/resources/solr.home/conf/schema.xml
@@ -100,10 +100,8 @@
 
       </analyzer>
       <analyzer type="query">
-        <!-- UAX29URLEmailTokenizer for query in order to recognise URI, 
-             email, and QName.
-        -->
-        <tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
+        <!-- whitespace tokenizer to not tokenize URI -->
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 
         <!-- Filters out those tokens *not* having length min through max 
              inclusive. -->
@@ -155,6 +153,9 @@
         <!-- Remove trailing slash of URIs. -->				
 				<filter class="org.sindice.siren.solr.analysis.URITrailingSlashFilterFactory"/>
 
+        <!-- Tokenize mailto URI -->
+        <filter class="org.sindice.siren.solr.analysis.MailtoFilterFactory"/>
+
 				<!-- Filters out those tokens *not* having length min through max 
              inclusive. -->
         <filter class="solr.LengthFilterFactory" min="2" max="256"/>
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,3 +20,4 @@ out/

		# Other
		bin/
		siren-solr/src/test/resources/solr.home/data/