Skip to content

LUCENE-8682: remove deprecated WordDelimiterFilter[Factory] classes #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
</analyzer>
</fieldType>

<!-- adds a WordDelimiterFilter producing stacked/synonym tokens -->
<!-- adds a WordDelimiterGraphFilter producing stacked/synonym tokens -->
<fieldType name="tagWDF" class="solr.TextField" positionIncrementGap="100"
postingsFormat="FST50" omitTermFreqAndPositions="true" omitNorms="true">
<analyzer type="index">
Expand Down
4 changes: 2 additions & 2 deletions solr/core/src/test-files/solr/collection1/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -205,15 +205,15 @@
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" expand="true"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" expand="true"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.Version;
import org.apache.solr.SolrTestCaseJ4;
Expand All @@ -30,10 +30,10 @@
import org.junit.Test;

/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
* New WordDelimiterGraphFilter tests... most of the tests are in ConvertedLegacyTest
*/
// TODO: add a low-level test for this factory
public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {
public class TestWordDelimiterGraphFilterFactory extends SolrTestCaseJ4 {

@BeforeClass
public static void beforeClass() throws Exception {
Expand Down Expand Up @@ -123,24 +123,6 @@ public void testPreserveOrignalTrue() {
clearIndex();
}

/*
public void testPerformance() throws IOException {
String s = "now is the time-for all good men to come to-the aid of their country.";
Token tok = new Token();
long start = System.currentTimeMillis();
int ret=0;
for (int i=0; i<1000000; i++) {
StringReader r = new StringReader(s);
TokenStream ts = new WhitespaceTokenizer(r);
ts = new WordDelimiterFilter(ts, 1,1,1,1,0);

while (ts.next(tok) != null) ret++;
}

System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
}
***/

@Test
public void testAlphaNumericWords(){
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
Expand Down Expand Up @@ -207,16 +189,16 @@ public void testCustomTypes() throws Exception {
args.put("splitOnCaseChange", "1");

/* default behavior */
WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(args);
WordDelimiterGraphFilterFactory factoryDefault = new WordDelimiterGraphFilterFactory(args);
factoryDefault.inform(loader);

TokenStream ts = factoryDefault.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });
new String[] { "I", "borrowed", "540000", "5", "400", "00", "at", "25", "interestrate", "interest", "rate" });
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is slightly concerning that this assertion depends on the ordering of output that, I presume, has no defined order.


ts = factoryDefault.create(whitespaceMockTokenizer("foo\u200Dbar"));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
new String[] { "foo", "foobar", "bar" });
new String[] { "foobar", "foo", "bar" });


/* custom behavior */
Expand All @@ -230,12 +212,12 @@ public void testCustomTypes() throws Exception {
args.put("catenateAll", "0");
args.put("splitOnCaseChange", "1");
args.put("types", "wdftypes.txt");
WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
WordDelimiterGraphFilterFactory factoryCustom = new WordDelimiterGraphFilterFactory(args);
factoryCustom.inform(loader);

ts = factoryCustom.create(whitespaceMockTokenizer(testText));
BaseTokenStreamTestCase.assertTokenStreamContents(ts,
new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interestrate", "interest", "rate" });

/* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
ts = factoryCustom.create(whitespaceMockTokenizer("foo\u200Dbar"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ public void testCollationWithHypens() throws Exception
List<String> collations = collationHolder.getAll("collation");
assertTrue(collations.size()==1);
String collation = collations.iterator().next();
assertTrue("Incorrect collation: " + collation,"lowerfilt:(hyphenated-word)".equals(collation));
assertTrue("Incorrect collation: " + collation,"lowerfilt:(hyphenword)".equals(collation));
}

params.remove(CommonParams.Q);
Expand All @@ -180,7 +180,7 @@ public void testCollationWithHypens() throws Exception
List<String> collations = collationHolder.getAll("collation");
assertTrue(collations.size()==1);
String collation = collations.iterator().next();
assertTrue("Incorrect collation: " + collation,"hyphenated-word".equals(collation));
assertTrue("Incorrect collation: " + collation,"hyphenword".equals(collation));
}

}
Expand Down
14 changes: 0 additions & 14 deletions solr/solr-ref-guide/src/filters.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -2875,20 +2875,6 @@ If *false*, or undefined, the file defined in `types` is used as a blacklist.
====
--

== Word Delimiter Filter

This filter splits tokens at word delimiters.

.Word Delimiter Filter has been Deprecated
[WARNING]
====
Word Delimiter Filter has been deprecated in favor of Word Delimiter Graph Filter, which is required to produce a correct token graph so that e.g., phrase queries can work correctly.
====

*Factory class:* `solr.WordDelimiterFilterFactory`

For a full description, including arguments and examples, see the Word Delimiter Graph Filter below.

== Word Delimiter Graph Filter

This filter splits tokens at word delimiters.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public boolean isTokenized() {
boolean={fields=[inStock],tokenized=false,analyzer=org.apache.solr.schema.BoolField$1@354949},
textTight={fields=[sku],tokenized=true,analyzer=TokenizerChain(org.apache.solr.analysis.WhitespaceTokenizerFactory@5e88f7,
org.apache.solr.analysis.SynonymFilterFactory@723646, org.apache.solr.analysis.StopFilterFactory@492ff1,
org.apache.solr.analysis.WordDelimiterFilterFactory@eaabad, org.apache.solr.analysis.LowerCaseFilterFactory@ad1355,
org.apache.solr.analysis.WordDelimiterGraphFilterFactory@eaabad, org.apache.solr.analysis.LowerCaseFilterFactory@ad1355,
org.apache.solr.analysis.EnglishPorterFilterFactory@d03a00, org.apache.solr.analysis.RemoveDuplicatesTokenFilterFactory@900079)},
long={fields=null,tokenized=false,analyzer=org.apache.solr.schema.FieldType$DefaultAnalyzer@f3b83},
double={fields=null,tokenized=false,analyzer=org.apache.solr.schema.FieldType$DefaultAnalyzer@c2b07},
Expand Down