diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java index caab47f58257..5ab33e6a00c1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilter.java @@ -35,7 +35,7 @@ * <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> * <filter class="solr.StopFilterFactory" ignoreCase="true"/> * <filter class="solr.HyphenatedWordsFilterFactory"/> - * <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + * <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> * <filter class="solr.LowerCaseFilterFactory"/> * <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> * </analyzer> @@ -43,7 +43,7 @@ * <tokenizer class="solr.WhitespaceTokenizerFactory"/> * <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> * <filter class="solr.StopFilterFactory" ignoreCase="true"/> - * <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> + * <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> * <filter class="solr.LowerCaseFilterFactory"/> * <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> * </analyzer> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java deleted file mode 100644 index 742f09821a90..000000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ /dev/null @@ -1,674 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.miscellaneous; - -import java.io.IOException; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.InPlaceMergeSorter; - -/** - * Splits words into subwords and performs optional transformations on subword groups. Words are - * split into subwords with the following rules: - * - * - * - * The GENERATE... options affect how incoming tokens are broken into parts, and the various - * CATENATE_... parameters affect how those parts are combined. - * - * - * - * One use for {@link WordDelimiterFilter} is to help match words with different subword delimiters. - * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" - * queries to all match. One way of doing so is to specify CATENATE options in the analyzer used for - * indexing, and not in the analyzer used for querying. Given that the current {@link - * StandardTokenizer} immediately removes many intra-word delimiters, it is recommended that this - * filter be used after a tokenizer that does not do this (such as {@link WhitespaceTokenizer}). - * - * @deprecated Use {@link WordDelimiterGraphFilter} instead: it produces a correct token graph so - * that e.g. {@link PhraseQuery} works correctly when it's used in the search time analyzer. - */ -@Deprecated -public final class WordDelimiterFilter extends TokenFilter { - - public static final int LOWER = 0x01; - public static final int UPPER = 0x02; - public static final int DIGIT = 0x04; - public static final int SUBWORD_DELIM = 0x08; - - // combinations: for testing, not for setting bits - public static final int ALPHA = 0x03; - public static final int ALPHANUM = 0x07; - - /** - * Causes parts of words to be generated: - * - *
<p>"PowerShot" => "Power" "Shot" - */ - public static final int GENERATE_WORD_PARTS = 1; - - /** - * Causes number subwords to be generated: - * - * <p>"500-42" => "500" "42" - */ - public static final int GENERATE_NUMBER_PARTS = 2; - - /** - * Causes maximum runs of word parts to be catenated: - * - * <p>"wi-fi" => "wifi" - */ - public static final int CATENATE_WORDS = 4; - - /** - * Causes maximum runs of number parts to be catenated: - * - * <p>"500-42" => "50042" - */ - public static final int CATENATE_NUMBERS = 8; - - /** - * Causes all subword parts to be catenated: - * - * <p>"wi-fi-4000" => "wifi4000" - */ - public static final int CATENATE_ALL = 16; - - /** - * Causes original words to be preserved and added to the subword list (defaults to false) - * - * <p>"500-42" => "500" "42" "500-42" - */ - public static final int PRESERVE_ORIGINAL = 32; - - /** - * If not set, causes case changes to be ignored (subwords will only be generated given - * SUBWORD_DELIM tokens) - */ - public static final int SPLIT_ON_CASE_CHANGE = 64; - - /** - * If not set, causes numeric changes to be ignored (subwords will only be generated given - * SUBWORD_DELIM tokens). - */ - public static final int SPLIT_ON_NUMERICS = 128; - - /** - * Causes trailing "'s" to be removed for each subword - * - * <p>"O'Neil's" => "O", "Neil" - */ - public static final int STEM_ENGLISH_POSSESSIVE = 256; - - /** Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true. */ - public static final int IGNORE_KEYWORDS = 512; - - /** If non-null, the set of tokens to protect from being delimited */ - final CharArraySet protWords; - - private final int flags; - - private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); - private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); - private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); - private final PositionIncrementAttribute posIncAttribute = - addAttribute(PositionIncrementAttribute.class); - private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); - - // used for iterating word delimiter breaks - private final WordDelimiterIterator iterator; - - // used for concatenating runs of similarly typed subwords (word, number) - private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation(); - // number of subwords last output by concat. - private int lastConcatCount = 0; - - // used for catenate all - private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation(); - - // used for accumulating position increment gaps - private int accumPosInc = 0; - - private char savedBuffer[] = new char[1024]; - private int savedStartOffset; - private int savedEndOffset; - private String savedType; - private boolean hasSavedState = false; - // if length by start + end offsets doesn't match the term text then assume - // this is a synonym and don't adjust the offsets. - private boolean hasIllegalOffsets = false; - - // for a run of the same subword type within a word, have we output anything? - private boolean hasOutputToken = false; - // when preserve original is on, have we output any token following it? - // this token must have posInc=0!
- private boolean hasOutputFollowingOriginal = false; - - /** - * Creates a new WordDelimiterFilter - * - * @param in TokenStream to be filtered - * @param charTypeTable table containing character types - * @param configurationFlags Flags configuring the filter - * @param protWords If non-null, the set of tokens to protect from being delimited - */ - public WordDelimiterFilter( - TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) { - super(in); - this.flags = configurationFlags; - this.protWords = protWords; - this.iterator = - new WordDelimiterIterator( - charTypeTable, - has(SPLIT_ON_CASE_CHANGE), - has(SPLIT_ON_NUMERICS), - has(STEM_ENGLISH_POSSESSIVE)); - } - - /** - * Creates a new WordDelimiterFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE} - * as its charTypeTable - * - * @param in TokenStream to be filtered - * @param configurationFlags Flags configuring the filter - * @param protWords If non-null, the set of tokens to protect from being delimited - */ - public WordDelimiterFilter(TokenStream in, int configurationFlags, CharArraySet protWords) { - this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords); - } - - @Override - public boolean incrementToken() throws IOException { - while (true) { - if (!hasSavedState) { - // process a new input word - if (!input.incrementToken()) { - return false; - } - if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) { - return true; - } - int termLength = termAttribute.length(); - char[] termBuffer = termAttribute.buffer(); - - accumPosInc += posIncAttribute.getPositionIncrement(); - - iterator.setText(termBuffer, termLength); - iterator.next(); - - // word of no delimiters, or protected word: just return it - if ((iterator.current == 0 && iterator.end == termLength) - || (protWords != null && protWords.contains(termBuffer, 0, termLength))) { - posIncAttribute.setPositionIncrement(accumPosInc); - accumPosInc = 0; - first = false; - return true; - } - - // word consisting only of delimiters - if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL)) { - // if the posInc is 1, simply ignore it in the accumulation - // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous - // logic! - if (posIncAttribute.getPositionIncrement() == 1 && !first) { - accumPosInc--; - } - continue; - } - - saveState(); - - hasOutputToken = false; - hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL); - lastConcatCount = 0; - - if (has(PRESERVE_ORIGINAL)) { - posIncAttribute.setPositionIncrement(accumPosInc); - accumPosInc = 0; - first = false; - return true; - } - } - - // at the end of the string, output any concatenations - if (iterator.end == WordDelimiterIterator.DONE) { - if (!concat.isEmpty()) { - if (flushConcatenation(concat)) { - buffer(); - continue; - } - } - - if (!concatAll.isEmpty()) { - // only if we haven't output this same combo above! - if (concatAll.subwordCount > lastConcatCount) { - concatAll.writeAndClear(); - buffer(); - continue; - } - concatAll.clear(); - } - - if (bufferedPos < bufferedLen) { - if (bufferedPos == 0) { - sorter.sort(0, bufferedLen); - } - clearAttributes(); - restoreState(buffered[bufferedPos++]); - if (first && posIncAttribute.getPositionIncrement() == 0) { - // can easily happen with strange combinations (e.g.
not outputting numbers, but - // concat-all) - posIncAttribute.setPositionIncrement(1); - } - first = false; - return true; - } - - // no saved concatenations, on to the next input word - bufferedPos = bufferedLen = 0; - hasSavedState = false; - continue; - } - - // word surrounded by delimiters: always output - if (iterator.isSingleWord()) { - generatePart(true); - iterator.next(); - first = false; - return true; - } - - int wordType = iterator.type(); - - // do we already have queued up incompatible concatenations? - if (!concat.isEmpty() && (concat.type & wordType) == 0) { - if (flushConcatenation(concat)) { - hasOutputToken = false; - buffer(); - continue; - } - hasOutputToken = false; - } - - // add subwords depending upon options - if (shouldConcatenate(wordType)) { - if (concat.isEmpty()) { - concat.type = wordType; - } - concatenate(concat); - } - - // add all subwords (catenateAll) - if (has(CATENATE_ALL)) { - concatenate(concatAll); - } - - // if we should output the word or number part - if (shouldGenerateParts(wordType)) { - generatePart(false); - buffer(); - } - - iterator.next(); - } - } - - @Override - public void reset() throws IOException { - super.reset(); - hasSavedState = false; - concat.clear(); - concatAll.clear(); - accumPosInc = bufferedPos = bufferedLen = 0; - first = true; - } - - private AttributeSource.State buffered[] = new AttributeSource.State[8]; - private int startOff[] = new int[8]; - private int posInc[] = new int[8]; - private int bufferedLen = 0; - private int bufferedPos = 0; - private boolean first; - - private class OffsetSorter extends InPlaceMergeSorter { - @Override - protected int compare(int i, int j) { - int cmp = Integer.compare(startOff[i], startOff[j]); - if (cmp == 0) { - cmp = Integer.compare(posInc[j], posInc[i]); - } - return cmp; - } - - @Override - protected void swap(int i, int j) { - AttributeSource.State tmp = buffered[i]; - buffered[i] = buffered[j]; - buffered[j] = tmp; - - int tmp2 = startOff[i]; - startOff[i] = startOff[j]; - startOff[j] = tmp2; - - tmp2 = posInc[i]; - posInc[i] = posInc[j]; - posInc[j] = tmp2; - } - } - - final OffsetSorter sorter = new OffsetSorter(); - - private void buffer() { - if (bufferedLen == buffered.length) { - int newSize = ArrayUtil.oversize(bufferedLen + 1, 8); - buffered = ArrayUtil.growExact(buffered, newSize); - startOff = ArrayUtil.growExact(startOff, newSize); - posInc = ArrayUtil.growExact(posInc, newSize); - } - startOff[bufferedLen] = offsetAttribute.startOffset(); - posInc[bufferedLen] = posIncAttribute.getPositionIncrement(); - buffered[bufferedLen] = captureState(); - bufferedLen++; - } - - /** Saves the existing attribute states */ - private void saveState() { - // otherwise, we have delimiters, save state - savedStartOffset = offsetAttribute.startOffset(); - savedEndOffset = offsetAttribute.endOffset(); - // if length by start + end offsets doesn't match the term text then assume this is a synonym - // and don't adjust the offsets. - hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length()); - savedType = typeAttribute.type(); - - if (savedBuffer.length < termAttribute.length()) { - savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), Character.BYTES)]; - } - - System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length()); - iterator.text = savedBuffer; - - hasSavedState = true; - } - - /** - * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or - * just clearing. 
- * - * @param concatenation WordDelimiterConcatenation that will be flushed - * @return {@code true} if the concatenation was written before it was cleared, {@code false} - * otherwise - */ - private boolean flushConcatenation(WordDelimiterConcatenation concatenation) { - lastConcatCount = concatenation.subwordCount; - if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) { - concatenation.writeAndClear(); - return true; - } - concatenation.clear(); - return false; - } - - /** - * Determines whether to concatenate a word or number if the current word is the given type - * - * @param wordType Type of the current word used to determine if it should be concatenated - * @return {@code true} if concatenation should occur, {@code false} otherwise - */ - private boolean shouldConcatenate(int wordType) { - return (has(CATENATE_WORDS) && isAlpha(wordType)) - || (has(CATENATE_NUMBERS) && isDigit(wordType)); - } - - /** - * Determines whether a word/number part should be generated for a word of the given type - * - * @param wordType Type of the word used to determine if a word/number part should be generated - * @return {@code true} if a word/number part should be generated, {@code false} otherwise - */ - private boolean shouldGenerateParts(int wordType) { - return (has(GENERATE_WORD_PARTS) && isAlpha(wordType)) - || (has(GENERATE_NUMBER_PARTS) && isDigit(wordType)); - } - - /** - * Concatenates the saved buffer to the given WordDelimiterConcatenation - * - * @param concatenation WordDelimiterConcatenation to concatenate the buffer to - */ - private void concatenate(WordDelimiterConcatenation concatenation) { - if (concatenation.isEmpty()) { - concatenation.startOffset = savedStartOffset + iterator.current; - } - concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current); - concatenation.endOffset = savedStartOffset + iterator.end; - } - - /** - * Generates a word/number part, updating the appropriate attributes - * - * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code - * false} otherwise - */ - private void generatePart(boolean isSingleWord) { - clearAttributes(); - termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current); - int startOffset = savedStartOffset + iterator.current; - int endOffset = savedStartOffset + iterator.end; - - if (hasIllegalOffsets) { - // historically this filter did this regardless for 'isSingleWord', - // but we must do a sanity check: - if (isSingleWord && startOffset <= savedEndOffset) { - offsetAttribute.setOffset(startOffset, savedEndOffset); - } else { - offsetAttribute.setOffset(savedStartOffset, savedEndOffset); - } - } else { - offsetAttribute.setOffset(startOffset, endOffset); - } - posIncAttribute.setPositionIncrement(position(false)); - typeAttribute.setType(savedType); - } - - /** - * Get the position increment gap for a subword or concatenation - * - * @param inject true if this token wants to be injected - * @return position increment gap - */ - private int position(boolean inject) { - int posInc = accumPosInc; - - if (hasOutputToken) { - accumPosInc = 0; - return inject ? 
0 : Math.max(1, posInc); - } - - hasOutputToken = true; - - if (!hasOutputFollowingOriginal) { - // the first token following the original is 0 regardless - hasOutputFollowingOriginal = true; - return 0; - } - // clear the accumulated position increment - accumPosInc = 0; - return Math.max(1, posInc); - } - - /** - * Checks if the given word type includes {@link #ALPHA} - * - * @param type Word type to check - * @return {@code true} if the type contains ALPHA, {@code false} otherwise - */ - static boolean isAlpha(int type) { - return (type & ALPHA) != 0; - } - - /** - * Checks if the given word type includes {@link #DIGIT} - * - * @param type Word type to check - * @return {@code true} if the type contains DIGIT, {@code false} otherwise - */ - static boolean isDigit(int type) { - return (type & DIGIT) != 0; - } - - /** - * Checks if the given word type includes {@link #SUBWORD_DELIM} - * - * @param type Word type to check - * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise - */ - static boolean isSubwordDelim(int type) { - return (type & SUBWORD_DELIM) != 0; - } - - /** - * Checks if the given word type includes {@link #UPPER} - * - * @param type Word type to check - * @return {@code true} if the type contains UPPER, {@code false} otherwise - */ - static boolean isUpper(int type) { - return (type & UPPER) != 0; - } - - /** - * Determines whether the given flag is set - * - * @param flag Flag to see if set - * @return {@code true} if flag is set - */ - private boolean has(int flag) { - return (flags & flag) != 0; - } - - /** A WDF concatenated 'run' */ - final class WordDelimiterConcatenation { - final StringBuilder buffer = new StringBuilder(); - int startOffset; - int endOffset; - int type; - int subwordCount; - - /** - * Appends the given text of the given length to the concatenation at the given offset - * - * @param text Text to append - * @param offset Offset in the concatenation to add the text - * @param length Length of the text to append - */ - void append(char text[], int offset, int length) { - buffer.append(text, offset, length); - subwordCount++; - } - - /** Writes the concatenation to the attributes */ - void write() { - clearAttributes(); - if (termAttribute.length() < buffer.length()) { - termAttribute.resizeBuffer(buffer.length()); - } - char termbuffer[] = termAttribute.buffer(); - - buffer.getChars(0, buffer.length(), termbuffer, 0); - termAttribute.setLength(buffer.length()); - - if (hasIllegalOffsets) { - offsetAttribute.setOffset(savedStartOffset, savedEndOffset); - } else { - offsetAttribute.setOffset(startOffset, endOffset); - } - posIncAttribute.setPositionIncrement(position(true)); - typeAttribute.setType(savedType); - accumPosInc = 0; - } - - /** - * Determines if the concatenation is empty - * - * @return {@code true} if the concatenation is empty, {@code false} otherwise - */ - boolean isEmpty() { - return buffer.length() == 0; - } - - /** Clears the concatenation and resets its state */ - void clear() { - buffer.setLength(0); - startOffset = endOffset = type = subwordCount = 0; - } - - /** - * Convenience method for the common scenario of having to write the concatenation and then - * clearing its state - */ - void writeAndClear() { - write(); - clear(); - } - } - // questions: - // negative numbers? -42 indexed as just 42? - // dollar sign? $42 - // percent sign? 33% - // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
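The "downsides" note above is the usual reason to apply the CATENATE flags at index time only: if "wi-fi" is indexed as "wi", "fi" and the catenated "wifi", then "wifi", "WiFi" and "wi-fi" queries can all match without any catenation in the query-time chain. A minimal sketch of that index-time setup with the surviving graph filter, assuming the three-argument WordDelimiterGraphFilter constructor and org.apache.lucene.analysis.core.FlattenGraphFilter from recent Lucene releases (the class name IndexTimeDelimiterAnalyzer is hypothetical):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

/** Sketch of an index-time analyzer: "wi-fi" becomes "wi", "fi" plus the catenated "wifi". */
public final class IndexTimeDelimiterAnalyzer extends Analyzer {
  private static final int FLAGS =
      WordDelimiterGraphFilter.GENERATE_WORD_PARTS
          | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
          | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
          | WordDelimiterGraphFilter.CATENATE_WORDS;

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    TokenStream sink = new WordDelimiterGraphFilter(source, FLAGS, null);
    // The graph filter emits a token graph; an index-time chain must flatten
    // it before indexing. A query-time analyzer would drop CATENATE_WORDS and
    // this flattening step, and consume the graph directly.
    sink = new FlattenGraphFilter(sink);
    return new TokenStreamComponents(source, sink);
  }
}

The same split shows up in factory form in the HyphenatedWordsFilter javadoc sample at the top of this patch: catenate options on in the index analyzer, off in the query analyzer.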
-} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java deleted file mode 100644 index f513cab97192..000000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenFilterFactory; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.util.ResourceLoader; -import org.apache.lucene.util.ResourceLoaderAware; - -/** - * Factory for {@link WordDelimiterFilter}. - * - *
<pre class="prettyprint">
- * <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
- *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
- *             catenateWords="0" catenateNumbers="0" catenateAll="0"
- *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
- *             types="wdfftypes.txt" />
- *   </analyzer>
- * </fieldType></pre>
- * - * @deprecated Use {@link WordDelimiterGraphFilterFactory} instead: it produces a correct token - * graph so that e.g. {@link PhraseQuery} works correctly when it's used in the search time - * analyzer. - * @since 3.1 - * @lucene.spi {@value #NAME} - */ -@Deprecated -public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - private static final int CATENATE_ALL = WordDelimiterFilter.CATENATE_ALL; - private static final int CATENATE_NUMBERS = WordDelimiterFilter.CATENATE_NUMBERS; - private static final int CATENATE_WORDS = WordDelimiterFilter.CATENATE_WORDS; - private static final int GENERATE_NUMBER_PARTS = WordDelimiterFilter.GENERATE_NUMBER_PARTS; - private static final int GENERATE_WORD_PARTS = WordDelimiterFilter.GENERATE_WORD_PARTS; - private static final int PRESERVE_ORIGINAL = WordDelimiterFilter.PRESERVE_ORIGINAL; - private static final int SPLIT_ON_CASE_CHANGE = WordDelimiterFilter.SPLIT_ON_CASE_CHANGE; - private static final int SPLIT_ON_NUMERICS = WordDelimiterFilter.SPLIT_ON_NUMERICS; - private static final int STEM_ENGLISH_POSSESSIVE = WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE; - private static final int ALPHA = WordDelimiterFilter.ALPHA; - private static final int ALPHANUM = WordDelimiterFilter.ALPHANUM; - private static final int DIGIT = WordDelimiterFilter.DIGIT; - private static final int LOWER = WordDelimiterFilter.LOWER; - private static final int SUBWORD_DELIM = WordDelimiterFilter.SUBWORD_DELIM; - private static final int UPPER = WordDelimiterFilter.UPPER; - - /** SPI name */ - public static final String NAME = "wordDelimiter"; - - public static final String PROTECTED_TOKENS = "protected"; - public static final String TYPES = "types"; - - private final String wordFiles; - private final String types; - private final int flags; - byte[] typeTable = null; - private CharArraySet protectedWords = null; - - /** Creates a new WordDelimiterFilterFactory */ - public WordDelimiterFilterFactory(Map<String, String> args) { - super(args); - int flags = 0; - if (getInt(args, "generateWordParts", 1) != 0) { - flags |= GENERATE_WORD_PARTS; - } - if (getInt(args, "generateNumberParts", 1) != 0) { - flags |= GENERATE_NUMBER_PARTS; - } - if (getInt(args, "catenateWords", 0) != 0) { - flags |= CATENATE_WORDS; - } - if (getInt(args, "catenateNumbers", 0) != 0) { - flags |= CATENATE_NUMBERS; - } - if (getInt(args, "catenateAll", 0) != 0) { - flags |= CATENATE_ALL; - } - if (getInt(args, "splitOnCaseChange", 1) != 0) { - flags |= SPLIT_ON_CASE_CHANGE; - } - if (getInt(args, "splitOnNumerics", 1) != 0) { - flags |= SPLIT_ON_NUMERICS; - } - if (getInt(args, "preserveOriginal", 0) != 0) { - flags |= PRESERVE_ORIGINAL; - } - if (getInt(args, "stemEnglishPossessive", 1) != 0) { - flags |= STEM_ENGLISH_POSSESSIVE; - } - wordFiles = get(args, PROTECTED_TOKENS); - types = get(args, TYPES); - this.flags = flags; - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } - } - - /** Default ctor for compatibility with SPI */ - public WordDelimiterFilterFactory() { - throw defaultCtorException(); - } - - @Override - public void inform(ResourceLoader loader) throws IOException { - if (wordFiles != null) { - protectedWords = getWordSet(loader, wordFiles, false); - } - if (types != null) { - List<String> files = splitFileNames(types); - List<String> wlist = new ArrayList<>(); - for (String file : files) { - List<String> lines = getLines(loader, file.trim()); - wlist.addAll(lines); - } - typeTable = parseTypes(wlist); - } - } - - @Override - public 
TokenFilter create(TokenStream input) { - return new WordDelimiterFilter( - input, - typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, - flags, - protectedWords); - } - - // source => type - private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$"); - - // parses a list of MappingCharFilter style rules into a custom byte[] type table - private byte[] parseTypes(List<String> rules) { - SortedMap<Character, Byte> typeMap = new TreeMap<>(); - for (String rule : rules) { - Matcher m = typePattern.matcher(rule); - if (!m.find()) throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]"); - String lhs = parseString(m.group(1).trim()); - Byte rhs = parseType(m.group(2).trim()); - if (lhs.length() != 1) - throw new IllegalArgumentException( - "Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); - if (rhs == null) - throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); - typeMap.put(lhs.charAt(0), rhs); - } - - // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance - byte types[] = - new byte - [Math.max( - typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; - for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); - for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) - types[mapping.getKey()] = mapping.getValue(); - return types; - } - - private Byte parseType(String s) { - if (s.equals("LOWER")) return LOWER; - else if (s.equals("UPPER")) return UPPER; - else if (s.equals("ALPHA")) return ALPHA; - else if (s.equals("DIGIT")) return DIGIT; - else if (s.equals("ALPHANUM")) return ALPHANUM; - else if (s.equals("SUBWORD_DELIM")) return SUBWORD_DELIM; - else return null; - } - - char[] out = new char[256]; - - private String parseString(String s) { - int readPos = 0; - int len = s.length(); - int writePos = 0; - while (readPos < len) { - char c = s.charAt(readPos++); - if (c == '\\') { - if (readPos >= len) - throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); - c = s.charAt(readPos++); - switch (c) { - case '\\': - c = '\\'; - break; - case 'n': - c = '\n'; - break; - case 't': - c = '\t'; - break; - case 'r': - c = '\r'; - break; - case 'b': - c = '\b'; - break; - case 'f': - c = '\f'; - break; - case 'u': - if (readPos + 3 >= len) - throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); - c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16); - readPos += 4; - break; - } - } - out[writePos++] = c; - } - return new String(out, 0, writePos); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index 8a66216f08ad..c390da7cf45f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -36,8 +36,8 @@ /** * Splits words into subwords and performs optional transformations on subword groups, producing a * correct token graph so that e.g. {@link PhraseQuery} can work correctly when this filter is used - * in the search-time analyzer. 
Unlike the now removed WordDelimiterFilter, this token + * filter produces a correct token graph as output. However, it cannot consume an input token graph * correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true. * *
<p>
Words are split into subwords with the following rules: diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory index 2899fd516b96..44e48fd4dd61 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory @@ -87,7 +87,6 @@ org.apache.lucene.analysis.miscellaneous.ProtectedTermFilterFactory org.apache.lucene.analysis.miscellaneous.TrimFilterFactory org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory -org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java index 5cb8d1586d15..ee58ab759442 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java @@ -35,7 +35,7 @@ import org.apache.lucene.analysis.charfilter.MappingCharFilter; import org.apache.lucene.analysis.charfilter.NormalizeCharMap; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.shingle.ShingleFilter; @@ -292,14 +292,13 @@ public void testCuriousWikipediaString() throws Exception { 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20 }; - @SuppressWarnings("deprecation") Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new WikipediaTokenizer(); TokenStream stream = new SopTokenFilter(tokenizer); - stream = new WordDelimiterFilter(stream, table, -50, protWords); + stream = new WordDelimiterGraphFilter(stream, false, table, 1024 - 50, protWords); stream = new SopTokenFilter(stream); return new TokenStreamComponents(tokenizer, stream); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index f015b70dee7d..0c7a07ec91d4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -81,7 +81,6 @@ import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import 
org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; @@ -148,7 +147,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase { initBrokenConstructors(); } - @SuppressWarnings("deprecation") private static void initBrokenConstructors() { try { brokenConstructors.put( @@ -200,9 +198,6 @@ private static void initBrokenConstructors() { ValidatingTokenFilter.class, // TODO: it seems to mess up offsets!? WikipediaTokenizer.class, - // TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or - // similar following will then cause pain) - WordDelimiterFilter.class, // Cannot correct offsets when a char filter had changed them: WordDelimiterGraphFilter.class, // requires a special encoded token value, so it may fail with random data: diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java deleted file mode 100644 index 9f8ab64dddcd..000000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.miscellaneous; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Random; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.StopFilter; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.en.EnglishAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.IOUtils; - -/** - * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest TODO: should - * explicitly test things like protWords and not rely on the factory tests in Solr. 
- */ -@Deprecated -public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { - - private static final int CATENATE_ALL = WordDelimiterFilter.CATENATE_ALL; - private static final int CATENATE_NUMBERS = WordDelimiterFilter.CATENATE_NUMBERS; - private static final int CATENATE_WORDS = WordDelimiterFilter.CATENATE_WORDS; - private static final int GENERATE_NUMBER_PARTS = WordDelimiterFilter.GENERATE_NUMBER_PARTS; - private static final int GENERATE_WORD_PARTS = WordDelimiterFilter.GENERATE_WORD_PARTS; - private static final int IGNORE_KEYWORDS = WordDelimiterFilter.IGNORE_KEYWORDS; - private static final int PRESERVE_ORIGINAL = WordDelimiterFilter.PRESERVE_ORIGINAL; - private static final int SPLIT_ON_CASE_CHANGE = WordDelimiterFilter.SPLIT_ON_CASE_CHANGE; - private static final int SPLIT_ON_NUMERICS = WordDelimiterFilter.SPLIT_ON_NUMERICS; - private static final int STEM_ENGLISH_POSSESSIVE = WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE; - private static final byte[] DEFAULT_WORD_DELIM_TABLE = - WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE; - - /* - public void testPerformance() throws IOException { - String s = "now is the time-for all good men to come to-the aid of their country."; - Token tok = new Token(); - long start = System.currentTimeMillis(); - int ret=0; - for (int i=0; i<1000000; i++) { - StringReader r = new StringReader(s); - TokenStream ts = new WhitespaceTokenizer(r); - ts = new WordDelimiterFilter(ts, 1,1,1,1,0); - - while (ts.next(tok) != null) ret++; - } - - System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start)); - } - ***/ - - public void testOffsets() throws IOException { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - // test that subwords and catenated subwords have - // the correct offsets. 
- WordDelimiterFilter wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("foo-bar", 5, 12)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents( - wdf, new String[] {"foo", "foobar", "bar"}, new int[] {5, 5, 9}, new int[] {8, 12, 12}); - - wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("foo-bar", 5, 6)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents( - wdf, new String[] {"foo", "bar", "foobar"}, new int[] {5, 5, 5}, new int[] {6, 6, 6}); - } - - public void testOffsetChange() throws Exception { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - WordDelimiterFilter wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("übelkeit)", 7, 16)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {7}, new int[] {15}); - } - - public void testOffsetChange2() throws Exception { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - WordDelimiterFilter wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("(übelkeit", 7, 17)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {8}, new int[] {17}); - } - - public void testOffsetChange3() throws Exception { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - WordDelimiterFilter wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("(übelkeit", 7, 16)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents(wdf, new String[] {"übelkeit"}, new int[] {8}, new int[] {16}); - } - - public void testOffsetChange4() throws Exception { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - WordDelimiterFilter wdf = - new WordDelimiterFilter( - new CannedTokenStream(new Token("(foo,bar)", 7, 16)), - DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents( - wdf, new String[] {"foo", "foobar", "bar"}, new int[] {8, 8, 12}, new int[] {11, 15, 15}); - } - - public void doSplit(final String input, String... 
output) throws Exception { - int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - WordDelimiterFilter wdf = - new WordDelimiterFilter( - keywordMockTokenizer(input), - WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, - flags, - null); - - assertTokenStreamContents(wdf, output); - } - - public void testSplits() throws Exception { - doSplit("basic-split", "basic", "split"); - doSplit("camelCase", "camel", "Case"); - - // non-space marking symbol shouldn't cause split - // this is an example in Thai - doSplit("\u0e1a\u0e49\u0e32\u0e19", "\u0e1a\u0e49\u0e32\u0e19"); - // possessive followed by delimiter - doSplit("test's'", "test"); - - // some russian upper and lowercase - doSplit("Роберт", "Роберт"); - // now cause a split (russian camelCase) - doSplit("РобЕрт", "Роб", "Ерт"); - - // a composed titlecase character, don't split - doSplit("aDžungla", "aDžungla"); - - // a modifier letter, don't split - doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام"); - - // enclosing mark, don't split - doSplit("test⃝", "test⃝"); - - // combining spacing mark (the virama), don't split - doSplit("हिन्दी", "हिन्दी"); - - // don't split non-ascii digits - doSplit("١٢٣٤", "١٢٣٤"); - - // don't split supplementaries into unpaired surrogates - doSplit("𠀀𠀀", "𠀀𠀀"); - } - - public void doSplitPossessive(int stemPossessive, final String input, final String... output) - throws Exception { - int flags = - GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; - flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0; - WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null); - - assertTokenStreamContents(wdf, output); - } - - /* - * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. - */ - public void testPossessives() throws Exception { - doSplitPossessive(1, "ra's", "ra"); - doSplitPossessive(0, "ra's", "ra", "s"); - } - - /* - * Set a large position increment gap of 10 if the token is "largegap" or "/" - */ - private static final class LargePosIncTokenFilter extends TokenFilter { - private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - - protected LargePosIncTokenFilter(TokenStream input) { - super(input); - } - - @Override - public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/")) - posIncAtt.setPositionIncrement(10); - return true; - } else { - return false; - } - } - } - - public void testPositionIncrements() throws Exception { - final int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false); - - /* analyzer that uses whitespace + wdf */ - Analyzer a = - new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, protWords)); - } - }; - - /* in this case, works as expected. 
*/ - assertAnalyzesTo( - a, - "LUCENE / SOLR", - new String[] {"LUCENE", "SOLR"}, - new int[] {0, 9}, - new int[] {6, 13}, - null, - new int[] {1, 1}, - null, - false); - - /* only in this case, posInc of 2 ?! */ - assertAnalyzesTo( - a, - "LUCENE / solR", - new String[] {"LUCENE", "sol", "solR", "R"}, - new int[] {0, 9, 9, 12}, - new int[] {6, 12, 13, 13}, - null, - new int[] {1, 1, 0, 1}, - null, - false); - - assertAnalyzesTo( - a, - "LUCENE / NUTCH SOLR", - new String[] {"LUCENE", "NUTCH", "SOLR"}, - new int[] {0, 9, 15}, - new int[] {6, 14, 19}, - null, - new int[] {1, 1, 1}, - null, - false); - - /* analyzer that will consume tokens with large position increments */ - Analyzer a2 = - new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, - new WordDelimiterFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords)); - } - }; - - /* increment of "largegap" is preserved */ - assertAnalyzesTo( - a2, - "LUCENE largegap SOLR", - new String[] {"LUCENE", "largegap", "SOLR"}, - new int[] {0, 7, 16}, - new int[] {6, 15, 20}, - null, - new int[] {1, 10, 1}, - null, - false); - - /* the "/" had a position increment of 10, where did it go?!?!! */ - assertAnalyzesTo( - a2, - "LUCENE / SOLR", - new String[] {"LUCENE", "SOLR"}, - new int[] {0, 9}, - new int[] {6, 13}, - null, - new int[] {1, 11}, - null, - false); - - /* in this case, the increment of 10 from the "/" is carried over */ - assertAnalyzesTo( - a2, - "LUCENE / solR", - new String[] {"LUCENE", "sol", "solR", "R"}, - new int[] {0, 9, 9, 12}, - new int[] {6, 12, 13, 13}, - null, - new int[] {1, 11, 0, 1}, - null, - false); - - assertAnalyzesTo( - a2, - "LUCENE / NUTCH SOLR", - new String[] {"LUCENE", "NUTCH", "SOLR"}, - new int[] {0, 9, 15}, - new int[] {6, 14, 19}, - null, - new int[] {1, 11, 1}, - null, - false); - - Analyzer a3 = - new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(filter, flags, protWords)); - } - }; - - assertAnalyzesTo( - a3, - "lucene.solr", - new String[] {"lucene", "lucenesolr", "solr"}, - new int[] {0, 0, 7}, - new int[] {6, 11, 11}, - null, - new int[] {1, 0, 1}, - null, - false); - - /* the stopword should add a gap here */ - assertAnalyzesTo( - a3, - "the lucene.solr", - new String[] {"lucene", "lucenesolr", "solr"}, - new int[] {4, 4, 11}, - new int[] {10, 15, 15}, - null, - new int[] {2, 0, 1}, - null, - false); - - IOUtils.close(a, a2, a3); - } - - public void testKeywordFilter() throws Exception { - assertAnalyzesTo( - keywordTestAnalyzer(GENERATE_WORD_PARTS), - "abc-def klm-nop kpop", - new String[] {"abc", "def", "klm", "nop", "kpop"}); - assertAnalyzesTo( - keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS), - "abc-def klm-nop kpop", - new String[] {"abc", "def", "klm-nop", "kpop"}, - new int[] {0, 4, 8, 16}, - new int[] {3, 7, 15, 20}, - null, - new int[] {1, 1, 1, 1}, - null, - false); - } - - private Analyzer keywordTestAnalyzer(int flags) throws Exception { - return new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - 
KeywordMarkerFilter kFilter = - new KeywordMarkerFilter(tokenizer) { - private final CharTermAttribute term = addAttribute(CharTermAttribute.class); - - @Override - public boolean isKeyword() { - // Marks terms starting with the letter 'k' as keywords - return term.toString().charAt(0) == 'k'; - } - }; - return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null)); - } - }; - } - - /** concat numbers + words + all */ - public void testLotsOfConcatenating() throws Exception { - final int flags = - GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_WORDS - | CATENATE_NUMBERS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - - /* analyzer that uses whitespace + wdf */ - Analyzer a = - new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); - } - }; - - assertAnalyzesTo( - a, - "abc-def-123-456", - new String[] {"abc", "abcdef", "abcdef123456", "def", "123", "123456", "456"}, - new int[] {0, 0, 0, 4, 8, 8, 12}, - new int[] {3, 7, 15, 7, 11, 15, 15}, - null, - new int[] {1, 0, 0, 1, 1, 0, 1}, - null, - false); - a.close(); - } - - /** concat numbers + words + all + preserve original */ - public void testLotsOfConcatenating2() throws Exception { - final int flags = - PRESERVE_ORIGINAL - | GENERATE_WORD_PARTS - | GENERATE_NUMBER_PARTS - | CATENATE_WORDS - | CATENATE_NUMBERS - | CATENATE_ALL - | SPLIT_ON_CASE_CHANGE - | SPLIT_ON_NUMERICS - | STEM_ENGLISH_POSSESSIVE; - - /* analyzer that uses whitespace + wdf */ - Analyzer a = - new Analyzer() { - @Override - public TokenStreamComponents createComponents(String field) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); - } - }; - - assertAnalyzesTo( - a, - "abc-def-123-456", - new String[] { - "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" - }, - new int[] {0, 0, 0, 0, 4, 8, 8, 12}, - new int[] {15, 3, 7, 15, 7, 11, 15, 15}, - null, - new int[] {1, 0, 0, 0, 1, 1, 0, 1}, - null, - false); - a.close(); - } - - /** blast some random strings through the analyzer */ - public void testRandomStrings() throws Exception { - int numIterations = atLeast(3); - for (int i = 0; i < numIterations; i++) { - final int flags = random().nextInt(512); - final CharArraySet protectedWords; - if (random().nextBoolean()) { - protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); - } else { - protectedWords = null; - } - - Analyzer a = - new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); - } - }; - // TODO: properly support positionLengthAttribute - checkRandomData(random(), a, 100 * RANDOM_MULTIPLIER, 20, false, false); - a.close(); - } - } - - /** blast some enormous random strings through the analyzer */ - public void testRandomHugeStrings() throws Exception { - int numIterations = atLeast(1); - for (int i = 0; i < numIterations; i++) { - final int flags = random().nextInt(512); - final CharArraySet protectedWords; - if (random().nextBoolean()) { - 
protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); - } else { - protectedWords = null; - } - - Analyzer a = - new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); - } - }; - // TODO: properly support positionLengthAttribute - checkRandomData(random(), a, 10 * RANDOM_MULTIPLIER, 8192, false, false); - a.close(); - } - } - - public void testEmptyTerm() throws IOException { - Random random = random(); - for (int i = 0; i < 512; i++) { - final int flags = i; - final CharArraySet protectedWords; - if (random.nextBoolean()) { - protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false); - } else { - protectedWords = null; - } - - Analyzer a = - new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords)); - } - }; - // depending upon options, this thing may or may not preserve the empty term - checkAnalysisConsistency(random, a, random.nextBoolean(), ""); - a.close(); - } - } - - /* - public void testToDot() throws Exception { - int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE; - String text = "PowerSystem2000-5-Shot's"; - WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null); - //StringWriter sw = new StringWriter(); - // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw)); - PrintWriter pw = new PrintWriter("/x/tmp/before.dot"); - TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw); - toDot.toDot(); - pw.close(); - System.out.println("TEST DONE"); - //System.out.println("DOT:\n" + sw.toString()); - } - */ - - public void testOnlyNumbers() throws Exception { - int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; - Analyzer a = - new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); - } - }; - - assertAnalyzesTo( - a, "7-586", new String[] {}, new int[] {}, new int[] {}, null, new int[] {}, null, false); - } - - public void testNumberPunct() throws Exception { - int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS; - Analyzer a = - new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents( - tokenizer, new WordDelimiterFilter(tokenizer, flags, null)); - } - }; - - assertAnalyzesTo( - a, - "6-", - new String[] {"6"}, - new int[] {0}, - new int[] {1}, - null, - new int[] {1}, - null, - false); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt index 
b0e31cb7ec83..26d237a59436 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt @@ -23,7 +23,7 @@ fooaaa,baraaa,bazaaa GB,gib,gigabyte,gigabytes MB,mib,megabyte,megabytes Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming #after us won't split it into two words. # Synonym mappings can be used for spelling correction too diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java index eea82c46911f..f77b90ba53dc 100644 --- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java +++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/HMMChineseTokenizerFactory.java @@ -25,7 +25,7 @@ * Factory for {@link HMMChineseTokenizer} * *
<p>Note: this class will currently emit tokens for punctuation. So you should either add a - * WordDelimiterFilter after to remove these (with concatenate off), or use the SmartChinese + * WordDelimiterGraphFilter after to remove these (with concatenate off), or use the SmartChinese * stoplist with a StopFilterFactory via: * words="org/apache/lucene/analysis/cn/smart/stopwords.txt" * diff --git a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/fragments/analysis/StepByStepAnalyzeResultPanelProvider.java b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/fragments/analysis/StepByStepAnalyzeResultPanelProvider.java index ccae64d3b662..7a5600cb03bc 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/fragments/analysis/StepByStepAnalyzeResultPanelProvider.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/fragments/analysis/StepByStepAnalyzeResultPanelProvider.java @@ -370,7 +370,7 @@ public int getColumnWidth() { // Currently this only show each tokenizer/filters result independently, // so the result doesn't show deletion/separation by next filter, - // e.g. "library" by WordDelimiterFilter is different position between other output. + // e.g. "library" by WordDelimiterGraphFilter is different position between other output. NamedTokensTableModel(List<Analysis.NamedTokens> namedTokens) { int maxColumnSize = 0; Analysis.NamedTokens namedToken;
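A practical consequence of the META-INF/services change earlier in this patch: SPI lookup by the old "wordDelimiter" name now fails, and configurations must move to the graph factory's name. A hedged sketch of the lookup, assuming TokenFilterFactory.forName and the "wordDelimiterGraph" SPI name from recent Lucene releases (the class name WordDelimiterSpiMigration is hypothetical); the argument keys mirror the factory options shown in the deleted factory's javadoc sample:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;

public class WordDelimiterSpiMigration {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<>();
    params.put("generateWordParts", "1");
    params.put("generateNumberParts", "1");
    params.put("catenateWords", "1");
    // After this change, forName("wordDelimiter", ...) should throw an
    // IllegalArgumentException because the SPI registration is gone;
    // only the graph variant remains registered.
    TokenFilterFactory factory = TokenFilterFactory.forName("wordDelimiterGraph", params);
    System.out.println(factory.getClass().getSimpleName()); // WordDelimiterGraphFilterFactory
  }
}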