Skip to content

Commit

Permalink
Merge branch 'wip-0.2.2' of github.com:rdelbru/SIREn into wip-0.2.2
Browse files Browse the repository at this point in the history
  • Loading branch information
rdelbru committed Nov 11, 2011
2 parents 8d3bb27 + 864c069 commit cb8cbff
Show file tree
Hide file tree
Showing 36 changed files with 562 additions and 176 deletions.
10 changes: 8 additions & 2 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

======================= Release 0.2.2 2011-xx-xx =======================
======================= Release 0.2.2-RC1 2011-11-07 =======================

New Features

Expand All @@ -12,10 +12,16 @@ Improvements

* [GH-22] Upgrade to Lucene/Solr 3.4

Bugs

* [SRN-116] Question mark not correctly escaped in keyword search
* [SRN-117] Phrase Query problem with uri local name

======================= Release 0.2.1 2011-09-13 =======================

Bugs

* [GH-15] siren-core - Fixed POM issue caused by the caliper dependency on gson
* [SRN-25] Counting of matchers in SirenDisjunctionScorer is wrong
* [SRN-79] Make URINormalisationFilter less aggressive: token length minimum
limit is now set to 3.
Expand Down Expand Up @@ -46,7 +52,7 @@ Improvements
* [SRN-100] Improve user-readable version of the query
* [SRN-102] Catch SolrException in SIREnQParser#parse
* [SRN-105] Upgrade to Lucene/Solr 3.1
* []SRN-108] Improved support of encoded character in URI
* [SRN-108] Improved support of encoded character in URI
* [SRN-112] Tokenisation problem with URI localnames

* [GH-1] Pom fixes
Expand Down
35 changes: 32 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

<groupId>org.sindice.siren</groupId>
<artifactId>siren-aggregator</artifactId>
<version>1-SNAPSHOT</version>
<version>0.2.2-RC2-SNAPSHOT</version>
<packaging>pom</packaging>
<name>siren-aggregator</name>

Expand All @@ -17,4 +16,34 @@
<module>siren-qparser</module>
</modules>

<scm>
<connection>scm:git:ssh://[email protected]/rdelbru/SIREn.git</connection>
<developerConnection>scm:git:ssh://[email protected]/rdelbru/SIREn.git</developerConnection>
<url>git:ssh://[email protected]/rdelbru/SIREn.git</url>
</scm>

<!-- To avoid an error when tagging -->
<build>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.2.1</version>
</plugin>
</plugins>
</pluginManagement>
</build>

<!-- distributionManagement>
<repository>
<id>repo</id>
<url>https://github.com/rdelbru/maven-repository/raw/master/releases</url>
</repository>
<snapshotRepository>
<id>snapshot-repo</id>
<url>https://github.com/rdelbru/maven-repository/raw/master/snapshots</url>
</snapshotRepository>
</distributionManagement -->

</project>
24 changes: 12 additions & 12 deletions siren-core/pom.xml
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<artifactId>siren-parent</artifactId>
<groupId>org.sindice.siren</groupId>
<version>0.2.2-SNAPSHOT</version>
<version>0.2.2-RC2-SNAPSHOT</version>
<relativePath>../siren-parent</relativePath>
</parent>

<groupId>org.sindice.siren</groupId>
<artifactId>siren-core</artifactId>
<version>0.2.2-SNAPSHOT</version>
<version>0.2.2-RC2-SNAPSHOT</version>
<name>siren-core</name>
<url>http://siren.sindice.org</url>

Expand Down Expand Up @@ -89,16 +88,17 @@
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<version>${lucene.version}</version>
<scope>test</scope>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.google.code.caliper</groupId>
<artifactId>caliper</artifactId>
<version>1.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.google.code.caliper</groupId>
<artifactId>caliper</artifactId>
<version>1.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>

<!-- Because of a dependency issue in caliper -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,19 @@
import org.sindice.siren.analysis.filter.URITrailingSlashFilter;

/**
* Analyzer designed to deal with any kind of URIs.
* Analyzer designed to deal with any kind of URIs and perform some post-processing
* on URIs.
* <br>
* The URI normalisation can be configured. You can disable it, activate it
* only on URI local name, or on the full URI. However, URI normalisation on the
* full URI is costly in terms of CPU at indexing time, and can double the size
* of the index, since each URI is duplicated by n tokens.
* <br>
* By default, the URI normalisation is disabled.
* <br>
* When full URI normalisation is activated, the analyzer is much slower than
* the WhitespaceTupleAnalyzer. If you are not indexing RDF data, consider
* using the WhitespaceTupleAnalyzer instead.
*/
public class AnyURIAnalyzer extends Analyzer {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,6 @@ public final TokenStream reusableTokenStream(final String fieldName, final Reade
streams.tokenStream = new SirenNumericTokenStream(precisionStep);
this.setNumericValue(streams.tokenStream, reader);
} else {
// streams.tokenStream.reset();
/*
* Calling reset would throw an exception if a previous call to this stream
* failed somehow.
* Anyway, the reset just reset values which are assigned later by the set***Value method
*/
this.setNumericValue(streams.tokenStream, reader);
}
return streams.tokenStream;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,13 @@
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
import org.sindice.siren.analysis.filter.DatatypeAnalyzerFilter;
import org.sindice.siren.analysis.filter.SirenDeltaPayloadFilter;
import org.sindice.siren.analysis.filter.TokenTypeFilter;
import org.sindice.siren.analysis.filter.DatatypeAnalyzerFilter;

/**
* The TupleAnalyzer is especially designed to process RDF data. It applies
* various post-processing on URIs and Literals.
* <br>
* The URI normalisation can be configured. You can disable it, activate it
* only on URI local name, or on the full URI. However, URI normalisation on the
* full URI is costly in term of CPU at indexing time, and can double the size
* of the index, since each URI is duplicated by n tokens.
* <br>
* By default, the URI normalisation is disabled.
* <br>
* When full uri normalisation is activated, the analyzer is much slower than
* the WhitespaceTupleAnalyzer. If you are not indexing RDF data, consider to
* use the WhitespaceTupleAnalyzer instead.
*/
public class TupleAnalyzer extends Analyzer {

Expand All @@ -62,6 +51,12 @@ public class TupleAnalyzer extends Analyzer {

private final CharArrayMap<Analyzer> regLitAnalyzers;

/**
* Create a {@link TupleAnalyzer} with the default {@link Analyzer} for Literals and URIs.
* @param version
* @param stringAnalyzer default Literal {@link Analyzer}
* @param anyURIAnalyzer default URI {@link Analyzer}
*/
public TupleAnalyzer(Version version, final Analyzer stringAnalyzer, final Analyzer anyURIAnalyzer) {
matchVersion = version;
this.stringAnalyzer = stringAnalyzer;
Expand All @@ -78,12 +73,21 @@ public void setAnyURIAnalyzer(final Analyzer analyzer) {
anyURIAnalyzer = analyzer;
}

/**
 * Register the {@link Analyzer} to be used with the given datatype key. That
 * analyzer is used to process the tokens output by the {@link TupleTokenizer}.
 * <br>
 * If an analyzer is already registered under this key, the existing
 * registration is kept and this call has no effect.
 * @param datatype the key under which the analyzer is registered
 * @param a the {@link Analyzer} to associate with the key
 */
public void registerLiteralAnalyzer(char[] datatype, Analyzer a) {
  // First registration wins: never overwrite an existing entry
  if (regLitAnalyzers.containsKey(datatype)) {
    return;
  }
  regLitAnalyzers.put(datatype, a);
}

/**
 * Empty the map of registered literal {@link Analyzer}s.
 */
public void clearRegisterLiteralAnalyzers() {
  this.regLitAnalyzers.clear();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

/**
* A grammar-based tokenizer constructed with JFlex for N-Tuples. Splits a
* N-Tuple into BNode, URI, Literal, Dot tokens.
* N-Tuple into BNode, URI, Literal and Dot tokens.
*/
public class TupleTokenizer
extends Tokenizer {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
import org.sindice.siren.analysis.filter.AssignTokenTypeFilter;

/**
* Analyzer designed to deal with any kind of URIs.
* Analyzer designed to deal with any kind of URIs. It does not perform any
* post-processing on URIs. Only the {@link LowerCaseFilter} is used.
*/
public class WhitespaceAnyURIAnalyzer extends Analyzer {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class DatatypeAnalyzerFilter extends TokenFilter {

private final CharArrayMap<Analyzer> dtsAnalyzer;

//
private CharTermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
Expand All @@ -85,6 +86,14 @@ public DatatypeAnalyzerFilter(final Version version,
this.initAttributes();
}

/**
* Create a {@link DatatypeAnalyzerFilter} with the given default {@link Analyzer}
* for URIs and Literals.
* @param version The Lucene version to use
* @param input the input token stream
* @param stringAnalyzer the default Literal {@link Analyzer}
* @param anyURIAnalyzer the default URI {@link Analyzer}
*/
public DatatypeAnalyzerFilter(final Version version,
final TokenStream input,
final Analyzer stringAnalyzer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ public final boolean incrementToken() throws java.io.IOException {

// While we are normalising the URI
if (_isNormalising) {
this.posIncrAtt.setPositionIncrement(1); // reset the position increment
this.nextToken();
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ public final boolean incrementToken() throws java.io.IOException {

// While we are normalising the URI
if (_isNormalising) {
this.posIncrAtt.setPositionIncrement(1); // reset the position increment
this.nextToken();
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite;

/**
*
* Code taken from {@link TopTermsRewrite} in order to use {@link SirenMultiTermQuery}.
*/
public abstract class SirenTopTermsRewrite<Q extends Query> extends SirenTermCollectingRewrite<Q> {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
import org.sindice.siren.analysis.AnyURIAnalyzer.URINormalisation;
import org.sindice.siren.analysis.attributes.CellAttribute;
import org.sindice.siren.analysis.attributes.TupleAttribute;
import org.sindice.siren.analysis.filter.URILocalnameFilter;
import org.sindice.siren.analysis.filter.URINormalisationFilter;

public class TestTupleAnalyzer {

Expand Down Expand Up @@ -82,7 +84,7 @@ public void assertAnalyzesTo(final Analyzer a, final String input,
final String[] expectedTypes,
final int[] expectedPosIncrs)
throws Exception {
this.assertAnalyzesTo(a, input, expectedImages, expectedTypes, null, null,
this.assertAnalyzesTo(a, input, expectedImages, expectedTypes, expectedPosIncrs, null,
null);
}

Expand Down Expand Up @@ -150,6 +152,46 @@ public void assertAnalyzesTo(final Analyzer a, final String input,
t.close();
}

/**
 * Test the LOCALNAME URI normalisation: the word "the" is a stop word, hence
 * it is filtered and the position increment of the next token is updated
 * accordingly (2 instead of 1).
 * <br>
 * Regression test for issue SRN-117, where the position increment was not
 * reset for the following calls.
 * @throws Exception
 */
@Test
public void testURINormalisation()
throws Exception {
  final AnyURIAnalyzer localnameAnalyzer = new AnyURIAnalyzer(Version.LUCENE_34);
  localnameAnalyzer.setUriNormalisation(URINormalisation.LOCALNAME);
  _a = new TupleAnalyzer(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31), localnameAnalyzer);

  // "the" is dropped, so "kingston" carries a position increment of 2
  final String[] images = { "kingston", "trio", "the_kingston_trio",
                            "http://dbpedia.org/resource/the_kingston_trio" };
  final String[] types = { "<URI>", "<URI>", "<URI>", "<URI>" };
  final int[] posIncrs = { 2, 1, 0, 0 };

  this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/The_Kingston_Trio>",
    images, types, posIncrs);
}

/**
 * Same test as for the LOCALNAME normalisation, but with the FULL
 * normalisation. The stop word used is now "their" instead of "the", because
 * {@link URINormalisationFilter} contains an internal filter of words smaller
 * than 4 (it was 3 for {@link URILocalnameFilter}).
 * @throws Exception
 */
@Test
public void testURINormalisation2()
throws Exception {
  final AnyURIAnalyzer fullAnalyzer = new AnyURIAnalyzer(Version.LUCENE_34);
  fullAnalyzer.setUriNormalisation(URINormalisation.FULL);
  _a = new TupleAnalyzer(Version.LUCENE_31, new StandardAnalyzer(Version.LUCENE_31), fullAnalyzer);

  // "their" is dropped, so "kingston" carries a position increment of 2
  final String[] images = { "dbpedia", "resource", "kingston", "trio",
                            "http://dbpedia.org/resource/their_kingston_trio" };
  final String[] types = { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" };
  final int[] posIncrs = { 1, 1, 2, 1, 0 };

  this.assertAnalyzesTo(_a, "<http://dbpedia.org/resource/their_Kingston_Trio>",
    images, types, posIncrs);
}

@Test
public void testURI()
throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public void assertNormalisesTo(final Tokenizer t, final String input,
final String[] expectedTypes,
final int[] expectedPosIncrs)
throws Exception {
this.assertNormalisesTo(t, input, expectedImages, expectedTypes, null, null,
this.assertNormalisesTo(t, input, expectedImages, expectedTypes, expectedPosIncrs, null,
null);
}

Expand Down Expand Up @@ -153,6 +153,21 @@ public void testURI()
new String[] { "uppercase", "Should", "Tokenised", "uppercaseShouldBeTokenised", "http://renaud.delbru.fr/rdf/uppercaseShouldBeTokenised" });
this.assertNormalisesTo(_t, "<http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised>",
new String[] { "AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised", "http://renaud.delbru.fr/rdf/AVeryLongLocalnameWithMoreThan64CharactersThatShouldNotBeTokenised" });


final String triple = "<http://dbpedia.org/resource/The_Kingston_Trio> " +
"<http://purl.org/dc/terms/subject> " +
"<http://dbpedia.org/resource/Category:Decca_Records_artists>";
this.assertNormalisesTo(_t, triple,
new String[] { "The", "Kingston", "Trio", "The_Kingston_Trio", "http://dbpedia.org/resource/The_Kingston_Trio",
"subject", "http://purl.org/dc/terms/subject",
"Category", "Decca", "Records", "artists", "Category:Decca_Records_artists", "http://dbpedia.org/resource/Category:Decca_Records_artists" },
new String[] { "<URI>", "<URI>", "<URI>", "<URI>", "<URI>",
"<URI>", "<URI>",
"<URI>", "<URI>", "<URI>", "<URI>", "<URI>", "<URI>" },
new int[] { 1, 1, 1, 0, 0,
1, 0,
1, 1, 1, 1, 0, 0 });
}

@Test
Expand Down
Loading

0 comments on commit cb8cbff

Please sign in to comment.