Skip to content

Commit

Permalink
Fixed tilde query problem
Browse files Browse the repository at this point in the history
  • Loading branch information
rdelbru committed May 26, 2011
1 parent 6699299 commit 996b945
Show file tree
Hide file tree
Showing 9 changed files with 127 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ out/

# Other
bin/
siren-solr/src/test/resources/solr.home/data/
Original file line number Diff line number Diff line change
Expand Up @@ -23,42 +23,67 @@
package org.sindice.siren.analysis.filter;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.sindice.siren.analysis.TupleTokenizer;

/**
* Filter that removes the trailing slash from tokens of type
* {@code TupleTokenizer.URI}
* {@link TupleTokenizer.URI} or {@link UAX29URLEmailTokenizer.URL_TYPE}
*/
public class URITrailingSlashFilter extends TokenFilter {

private final String tokenType;
private Set<String> tokenTypes = new HashSet<String>();
private final static Set<String> DEFAULT_TOKEN_TYPES = new HashSet<String>();

private final CharTermAttribute termAtt;
private final TypeAttribute typeAtt;

// by default, check the token type
public static final boolean DEFAULT_CHECKTYPE = true;
private boolean checkType = DEFAULT_CHECKTYPE;

// Populate the default token types: the URI type reported by TupleTokenizer
// and the URL type of UAX29URLEmailTokenizer.
static {
DEFAULT_TOKEN_TYPES.add(TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
DEFAULT_TOKEN_TYPES.add(UAX29URLEmailTokenizer.URL_TYPE);
}

public URITrailingSlashFilter(final TokenStream in) {
this(in, TupleTokenizer.getTokenTypes()[TupleTokenizer.URI]);
this(in, new HashSet<String>(DEFAULT_TOKEN_TYPES));
}

public URITrailingSlashFilter(final TokenStream in, final String tokenType) {
super(in);
this.tokenType = tokenType;
this.tokenTypes.add(tokenType);
termAtt = this.addAttribute(CharTermAttribute.class);
typeAtt = this.addAttribute(TypeAttribute.class);
}

/**
 * Creates a filter restricted to the given set of token types.
 *
 * @param in the token stream to filter
 * @param tokenTypes the token types that are eligible for trailing slash
 *        removal while the token type check is enabled; must not be
 *        {@code null}
 */
public URITrailingSlashFilter(final TokenStream in, final Set<String> tokenTypes) {
  super(in);
  // Defensive copy: keep a private set so that the caller mutating its own
  // set afterwards cannot change which token types this filter matches.
  this.tokenTypes = new HashSet<String>(tokenTypes);
  termAtt = this.addAttribute(CharTermAttribute.class);
  typeAtt = this.addAttribute(TypeAttribute.class);
}

/**
 * Enables or disables the token type check performed in
 * {@code incrementToken()}.
 *
 * @param checkType {@code true} to handle only tokens whose type is in the
 *        configured set of token types; {@code false} to handle every token
 *        regardless of its type
 */
public void setCheckTokenType(final boolean checkType) {
this.checkType = checkType;
}

@Override
public final boolean incrementToken() throws IOException {
if (!input.incrementToken()) {
return false;
}

final String type = typeAtt.type();
if (type.equals(tokenType)) {
if (checkType ? tokenTypes.contains(type) : true) {
final int bufferLength = termAtt.length();
// Remove trailing slash
if (termAtt.buffer()[bufferLength - 1] == '/') {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,12 @@ public Query parse(final String query) throws ParseException {
}
}

// TODO: does not support mailto: uri
static String uriRegExp = "(news|(ht|f)tp(s?))\\://[a-zA-Z0-9\\-\\.]+\\.[a-zA-Z]{2,3}(/\\S*)?";
static Pattern pattern = Pattern.compile(uriRegExp);

// TODO: check if other special characters of lucene can appear in a URI, and
// escape them
// escape them, for example ~.
private String escapeURIs(final String query) {
final Matcher matcher = pattern.matcher(query);

Expand All @@ -96,7 +97,7 @@ public static String escape(final String s) {
for (int i = 0; i < s.length(); i++) {
final char c = s.charAt(i);
// These characters are part of the field query syntax and must be escaped
if (c == ':' || c == '\\') {
if (c == ':' || c == '\\' || c == '~') {
sb.append('\\');
}
sb.append(c);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public KeywordQueryConfigHandler() {
this.addAttribute(MultiTermRewriteMethodAttribute.class);
this.addAttribute(AllowFuzzyAndWildcardAttribute.class);
this.addAttribute(AllowLeadingWildcardAttribute.class);
this.addAttribute(PositionIncrementsAttribute.class);
this.addAttribute(PositionIncrementsAttribute.class).setPositionIncrementsEnabled(true);
this.addAttribute(LocaleAttribute.class);
this.addAttribute(DefaultPhraseSlopAttribute.class);
this.addAttribute(MultiTermRewriteMethodAttribute.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.apache.lucene.queryParser.standard.config.AnalyzerAttribute;
import org.apache.lucene.queryParser.standard.config.PositionIncrementsAttribute;
import org.apache.lucene.queryParser.standard.nodes.MultiPhraseQueryNode;
import org.apache.lucene.queryParser.standard.nodes.StandardBooleanQueryNode;
import org.apache.lucene.queryParser.standard.nodes.WildcardQueryNode;

/**
Expand All @@ -63,11 +62,14 @@
* If the analyzer returns only one term, the returned term is set to the
* {@link FieldQueryNode} and it's returned. <br/>
* <br/>
* If the analyzer return more than one term, a {@link OrQueryNode},
* If the analyzer return more than one term
* {@link TokenizedPhraseQueryNode} or {@link MultiPhraseQueryNode} is created,
* whether there is one or more
* terms at the same position, and it's returned. <br/>
* <br/>
* An {@link OrQueryNode} can be returned if query expansion is detected, i.e.,
* more than one term at the same position.
* <br/>
* If no term is returned by the analyzer a {@link NoTokenFoundQueryNode} object
* is returned. <br/>
* <br/>
Expand Down Expand Up @@ -183,7 +185,8 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept
if (numTokens == 0) {
return new NoTokenFoundQueryNode();

} else if (numTokens == 1) {
}
else if (numTokens == 1) {
String term = null;
try {
boolean hasNext;
Expand All @@ -199,13 +202,18 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept

return fieldNode;

} else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
}
else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
// no phrase query:
final LinkedList<QueryNode> children = new LinkedList<QueryNode>();

int position = -1;

for (int i = 0; i < numTokens; i++) {
String term = null;
final int positionIncrement = 1;

try {
final boolean hasNext = buffer.incrementToken();
assert hasNext == true;
Expand All @@ -215,21 +223,31 @@ protected QueryNode postProcessNode(final QueryNode node) throws QueryNodeExcept
// safe to ignore, because we know the number of tokens
}

children.add(new FieldQueryNode(field, term, -1, -1));
final FieldQueryNode newFieldNode = new FieldQueryNode(field, term, -1, -1);

if (this.positionIncrementsEnabled) {
position += positionIncrement;
newFieldNode.setPositionIncrement(position);
} else {
newFieldNode.setPositionIncrement(i);
}

children.add(new FieldQueryNode(field, term, -1, -1));
}

// If multiple terms at one single position, this must be a query
// expansion. Perform a OR between the terms.
if (severalTokensAtSamePosition && positionCount == 1) {
return new GroupQueryNode(new OrQueryNode(children));
}
else if (positionCount == 1) {
return new GroupQueryNode(
new StandardBooleanQueryNode(children, true));
}
// if several tokens at same position && position count > 1, then
// results can be unexpected
else {
return new StandardBooleanQueryNode(children, false);
final TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();
for (int i = 0; i < children.size(); i++) {
pq.add(children.get(i));
}
return pq;
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,32 @@
*/
package org.sindice.siren.solr.analysis;

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenFilterFactory;
import org.sindice.siren.analysis.filter.URITrailingSlashFilter;

public class URITrailingSlashFilterFactory
extends BaseTokenFilterFactory {

public static final String CHECKTYPE_KEY = "checkTokenType";

private boolean checkType = true;

/**
 * Initialises the factory and reads the optional {@code checkTokenType}
 * argument. When the argument is absent, the filter's own default
 * ({@code URITrailingSlashFilter.DEFAULT_CHECKTYPE}) is used.
 *
 * @param args the factory arguments
 */
@Override
public void init(final Map<String,String> args) {
  super.init(args);
  this.assureMatchVersion();

  final String configured = args.get(CHECKTYPE_KEY);
  if (configured == null) {
    checkType = URITrailingSlashFilter.DEFAULT_CHECKTYPE;
  } else {
    checkType = Boolean.parseBoolean(configured);
  }
}

@Override
public TokenStream create(final TokenStream input) {
return new URITrailingSlashFilter(input);
final URITrailingSlashFilter filter = new URITrailingSlashFilter(input);
filter.setCheckTokenType(checkType);
return filter;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -263,4 +263,40 @@ public void testASCIIFoldingExpansion() throws IOException, SolrServerException
Assert.assertTrue("id2 should get higher score than id1", score1 < score2);
}

/**
 * Indexes a single triple whose object is a mailto URI, then checks that the
 * document is found by a query that carries the {@code mailto:} prefix and by
 * a query that omits it.
 *
 * NOTE(review): the address literals read "[email protected]", which looks
 * like e-mail obfuscation applied when this text was captured - confirm the
 * real address against the repository before relying on this test.
 */
@Test
public void testMailto() throws IOException, SolrServerException {
this.addNTripleString("id1", "<http://s> <http://p> <mailto:[email protected]> .");
// Query including the "mailto:" scheme
SolrQuery query = new SolrQuery();
query.setQuery("mailto:[email protected]");
query.setQueryType("siren");

String[] results = wrapper.search(query, "url");
assertEquals(1, results.length);

// Query without the "mailto:" scheme
query = new SolrQuery();
query.setQuery("[email protected]");
query.setQueryType("siren");

results = wrapper.search(query, "url");
assertEquals(1, results.length);
}

/**
 * Indexes a single triple whose object URI contains a tilde and ends with a
 * slash, then checks that the document is found when querying for that URI
 * both with and without the trailing slash.
 */
@Test
public void testTildeInURI() throws IOException, SolrServerException {
  this.addNTripleString("id1", "<http://s> <http://p> <http://sw.deri.org/~aidanh/> .");

  // Same URI, first with and then without its trailing slash
  final String[] uriQueries = {
    "http://sw.deri.org/~aidanh/",
    "http://sw.deri.org/~aidanh"
  };

  for (final String uriQuery : uriQueries) {
    final SolrQuery query = new SolrQuery();
    query.setQuery(uriQuery);
    query.setQueryType("siren");

    final String[] results = wrapper.search(query, "url");
    assertEquals(1, results.length);
  }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
-->
<fieldType name="ntriple-uri" class="org.apache.solr.schema.SubTextField">
<analyzer type="query">
<!-- whitespace tokenizer to not tokenize URI -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>

<!-- Remove trailing slash of URIs. -->
Expand Down Expand Up @@ -130,7 +131,12 @@
-->
<fieldType name="ntriple-keyword" class="org.apache.solr.schema.SubTextField">
<analyzer type="query">
<!-- whitespace tokenizer to not tokenize URI -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>

<!-- Remove trailing slash of URIs. -->
<filter class="org.sindice.siren.solr.analysis.URITrailingSlashFilterFactory"
checkTokenType="false"/>

<!-- Filters out those tokens *not* having length min through max
inclusive. -->
Expand Down
9 changes: 5 additions & 4 deletions siren-solr/src/test/resources/solr.home/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,8 @@

</analyzer>
<analyzer type="query">
<!-- UAX29URLEmailTokenizer for query in order to recognise URI,
email, and QName.
-->
<tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
<!-- whitespace tokenizer to not tokenize URI -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>

<!-- Filters out those tokens *not* having length min through max
inclusive. -->
Expand Down Expand Up @@ -155,6 +153,9 @@
<!-- Remove trailing slash of URIs. -->
<filter class="org.sindice.siren.solr.analysis.URITrailingSlashFilterFactory"/>

<!-- Tokenize mailto URI -->
<filter class="org.sindice.siren.solr.analysis.MailtoFilterFactory"/>

<!-- Filters out those tokens *not* having length min through max
inclusive. -->
<filter class="solr.LengthFilterFactory" min="2" max="256"/>
Expand Down

0 comments on commit 996b945

Please sign in to comment.