diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eab767a --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +build-lib +build +dist diff --git a/README.md b/README.md index ffc75df..9ca7361 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,23 @@ +# Notes about this fork + +This project is a fork of https://github.com/lucidworks/query-autofiltering-component, but includes the following changes to the 5.x code base: + +* Resolved issues with Ivy dependencies. +* Upgraded component to work with Solr 5.3.1 and above. +* Added support for using a field whitelist. + +The whitelist field definition feature was implemented to solve a gap regarding dynamic fields. Although asking the Luke admin handler would have been another implementation option, it seems that a generic whitelist is more powerful, albeit a bit more verbose in the configuration. Simply define the following: + +
+ <searchComponent name="autofilter" class="org.apache.solr.handler.component.QueryAutoFilteringComponent" >
+   <arr name="whitelistFields">
+   <str>field1</str>
+   <str>field2</str>
+   <str>fieldN</str>
+   </arr>
+ </searchComponent>
+
+ # query-autofiltering-component A Query Autofiltering SearchComponent for Solr that can translate free-text queries into structured queries using index metadata. diff --git a/solr5.x/build.xml b/solr5.x/build.xml index 6e74893..30b91f9 100644 --- a/solr5.x/build.xml +++ b/solr5.x/build.xml @@ -20,8 +20,8 @@ - - + + diff --git a/solr5.x/ivy/ivy-settings.xml b/solr5.x/ivy/ivy-settings.xml index 19d4394..5680754 100644 --- a/solr5.x/ivy/ivy-settings.xml +++ b/solr5.x/ivy/ivy-settings.xml @@ -1,7 +1,11 @@ - - + + + + + + diff --git a/solr5.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java b/solr5.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java index eaf13a1..9fbc43a 100644 --- a/solr5.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java +++ b/solr5.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java @@ -98,6 +98,7 @@ public class QueryAutoFilteringComponent extends QueryComponent implements SolrC private String termsHandler = "/terms"; + private HashSet whitelistFields;; private HashSet excludeFields; private HashSet stopwords; @@ -115,6 +116,15 @@ public class QueryAutoFilteringComponent extends QueryComponent implements SolrC @Override public void init( NamedList initArgs ) { + + List whitelistFields = (List) initArgs.get("whitelistFields"); + if (whitelistFields != null) { + this.whitelistFields = new HashSet( ); + for (String field : whitelistFields ) { + this.whitelistFields.add( field ); + } + } + List excludeFields = (List) initArgs.get("excludeFields"); if (excludeFields != null) { this.excludeFields = new HashSet( ); @@ -469,16 +479,15 @@ else if (qbuilder.length() > 0 && fieldMap.size() > 0) { } } else { // boostFactor is NOT null - // use the original query add fielded boost clauses + // use the bq field to add fielded boost clauses StringBuilder bbuilder = new StringBuilder( ); String boostSuffix = "^" + boostFactor.toString( ); 
- bbuilder.append( getPhrase( queryTokens, 0, queryTokens.size() - 1, " " ) ); for (String fieldName : fieldMap.keySet( ) ) { bbuilder.append( " " ); bbuilder.append( getFilterQuery( rb, fieldName, fieldMap.get( fieldName ), fieldPositionMap.get( fieldName ), queryTokens, boostSuffix ) ); } - Log.info( "setting q = '" + bbuilder.toString() + "'" ); - modParams.set( "q", bbuilder.toString( ) ); + Log.info( "adding bq = '" + bbuilder.toString() + "'" ); + modParams.add( "bq", bbuilder.toString( ).trim() ); } return true; } @@ -729,23 +738,35 @@ private void buildFieldMap( ResponseBuilder rb ) throws IOException { // TODO: Filter this by the configuration fields ... private ArrayList getStringFields( SolrIndexSearcher searcher ) { - IndexSchema schema = searcher.getSchema(); + ArrayList strFields = new ArrayList( ); - - Collection fieldNames = searcher.getFieldNames(); - Iterator fnIt = fieldNames.iterator(); - while ( fnIt.hasNext() ) { - String fieldName = fnIt.next( ); - if (excludeFields == null || !excludeFields.contains( fieldName )) { - SchemaField field = schema.getField(fieldName); - if (field.stored() && field.getType() instanceof StrField ) { - strFields.add( fieldName ); + + if ( hasWhitelist() ) { + Log.info("Using whitelist fields instead of schema."); + for ( String fieldName: whitelistFields ) { + strFields.add( fieldName ); + } + } else { + IndexSchema schema = searcher.getSchema(); + Collection fieldNames = searcher.getFieldNames(); + Iterator fnIt = fieldNames.iterator(); + while ( fnIt.hasNext() ) { + String fieldName = fnIt.next( ); + if (excludeFields == null || !excludeFields.contains( fieldName )) { + SchemaField field = schema.getField(fieldName); + if (field.stored() && field.getType() instanceof StrField ) { + strFields.add( fieldName ); + } } } } - + return strFields; } + + private boolean hasWhitelist() { + return this.whitelistFields != null && this.whitelistFields.size() > 0; + } private void addTerm( CharsRef fieldChars, String 
fieldValue, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder ) throws IOException { @@ -804,11 +825,13 @@ private void addDistributedTerms( ResponseBuilder rb, SynonymMap.Builder fieldBu ShardHandlerFactory shardHandlerFactory = container.getShardHandlerFactory( ); ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); - shardHandler.checkDistributed( rb ); - - Log.debug( "Is Distributed = " + rb.isDistrib ); + + final SolrParams distribParams = rb.req.getParams(); + final boolean isDistrib = distribParams.get(ShardParams.SHARDS) != null; + Log.debug( "Is Distributed = " + isDistrib ); - if( rb.isDistrib ) { + if( isDistrib ) { + shardHandler.prepDistributed( rb ); // create a ShardRequest that contains a Terms Request. // don't send to this shard??? ShardRequest sreq = new ShardRequest(); diff --git a/solr5.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml b/solr5.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml index 7514aa4..13f8214 100644 --- a/solr5.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml +++ b/solr5.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml @@ -31,7 +31,7 @@ A solrconfig.xml snippet containing indexConfig settings for randomized testing. 
${useCompoundFile:false} ${solr.tests.maxBufferedDocs} - ${solr.tests.maxIndexingThreads} + ${solr.tests.maxIndexingThreads:8} ${solr.tests.ramBufferSizeMB} diff --git a/solr6.x/build.xml b/solr6.x/build.xml new file mode 100644 index 0000000..c17f314 --- /dev/null +++ b/solr6.x/build.xml @@ -0,0 +1,127 @@ + + Builds Query Autofiltering Component + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr6.x/ivy.xml b/solr6.x/ivy.xml new file mode 100644 index 0000000..f0e8ae2 --- /dev/null +++ b/solr6.x/ivy.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/solr6.x/ivy/ivy-settings.xml b/solr6.x/ivy/ivy-settings.xml new file mode 100644 index 0000000..5680754 --- /dev/null +++ b/solr6.x/ivy/ivy-settings.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/solr6.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java b/solr6.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java new file mode 100644 index 0000000..f98c6ca --- /dev/null +++ b/solr6.x/src/main/java/org/apache/solr/handler/component/QueryAutoFilteringComponent.java @@ -0,0 +1,1420 @@ +package org.apache.solr.handler.component; + +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import 
org.apache.lucene.uninverting.UninvertingReader; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.fst.FST; +import org.apache.solr.client.solrj.response.TermsResponse; +import org.apache.solr.common.params.*; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrEventListener; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.StrField; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.util.plugin.SolrCoreAware; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; +import java.util.*; + +/** + * Creates filter or boost queries from freetext queries based on pattern matches with terms in stored String fields. Uses + * the FieldCache (UninvertingIndex) to build a map of term to search field. This map is then used to parse the + * query to detect phrases that map to specific field values. These field/value pairs can then be used to generate + * a filter query or a boost query if recall needs to be preserved. + * + * For SolrCloud, this component requires that the TermsComponent be defined in solrconfig.xml. This is used + * to get distributed term maps. 
+ * + * Compiles with Solr 5.x + */ + +public class QueryAutoFilteringComponent extends QueryComponent implements SolrCoreAware, SolrEventListener { + + private static final Logger Log = LoggerFactory.getLogger( QueryAutoFilteringComponent.class ); + + public static final String MINIMUM_TOKENS = "mt"; + public static final String BOOST_PARAM = "afb"; + + private SynonymMap fieldMap; // Map of search terms to fieldName + private SynonymMap synonyms; // synonyms from synonyms.txt + private SynonymMap termMap; // Map of search term to indexed term + + private String synonymsFile; + + private NamedList initParams; + + private boolean initFieldMap = false; + + private String termsHandler = "/terms"; + + private HashSet whitelistFields;; + private HashSet excludeFields; + private HashSet stopwords; + + private Integer boostFactor; // if null, use Filter Query + + // For multiple terms in the same field, if field is multi-valued = use AND for filter query + private boolean useAndForMultiValuedFields = true; + + private String fieldDelim = "|"; + + private String fieldSplitExpr = "\\|"; + + // map of a "verb" phrase to a metadata field + private ArrayList verbModifierList; + + @Override + public void init( NamedList initArgs ) { + List whitelistFields = (List) initArgs.get("whitelistFields"); + if (whitelistFields != null) { + this.whitelistFields = new HashSet( ); + for (String field : whitelistFields ) { + this.whitelistFields.add( field ); + } + } + + List excludeFields = (List) initArgs.get("excludeFields"); + if (excludeFields != null) { + this.excludeFields = new HashSet( ); + for (String field : excludeFields ) { + this.excludeFields.add( field ); + } + } + + List verbModifiers = (List)initArgs.get( "verbModifiers" ); + if (verbModifiers != null) { + this.verbModifierList = new ArrayList( ); + for (String modifier : verbModifiers) { + String modifierPhrase = new String( modifier.substring( 0, modifier.indexOf( ":" ))); + String modifierFields = new String( 
modifier.substring( modifier.indexOf( ":" ) + 1 )); + + if (modifierPhrase.indexOf( "," ) > 0) { + String[] phrases = modifierPhrase.split( "," ); + for (int i = 0; i < phrases.length; i++) { + addModifier( phrases[i], modifierFields ); + } + } + else { + addModifier( modifierPhrase, modifierFields ); + } + } + } + + Integer boostFactor = (Integer)initArgs.get( "boostFactor" ); + if (boostFactor != null) { + this.boostFactor = boostFactor; + } + + String useAndForMV = (String)initArgs.get( "useAndForMultiValuedFields" ); + if (useAndForMV != null) { + this.useAndForMultiValuedFields = useAndForMV.equalsIgnoreCase( "true" ); + } + + String useFieldDelim = (String)initArgs.get( "fieldDelimiter" ); + if (useFieldDelim != null) { + this.fieldDelim = useFieldDelim; + this.fieldSplitExpr = useFieldDelim; + } + + initParams = initArgs; + } + + private void addModifier( String modifierPhrase, String modifierFields ) { + Log.info( "addModifier: " + modifierPhrase + ": " + modifierFields ); + ModifierDefinition modDef = new ModifierDefinition( ); + modDef.modifierPhrase = modifierPhrase.toLowerCase( ); + + if (modifierFields.indexOf( fieldDelim ) > 0) { + modDef.filterFields = new HashMap( ); + String fieldPairs = new String( modifierFields.substring( modifierFields.indexOf( fieldDelim ) + 1 )); + modifierFields = new String( modifierFields.substring( 0, modifierFields.indexOf( fieldDelim ))); + Log.info( "fieldPairs = " + fieldPairs ); + + String modifierTemplate = null; + if (fieldPairs.indexOf( fieldDelim ) > 0) { + modifierTemplate = new String( fieldPairs.substring( fieldPairs.indexOf( fieldDelim ) + 1 )); + fieldPairs = new String( fieldPairs.substring( 0, fieldPairs.indexOf( fieldDelim ))); + } + + if (fieldPairs.indexOf( "," ) > 0) { + String[] fieldPairList = fieldPairs.split( "," ); + for (int i = 0; i < fieldPairList.length; i++) { + String field = new String( fieldPairList[i].substring( 0, fieldPairList[i].indexOf( ":" ))); + String value = new 
String(fieldPairList[i].substring( fieldPairList[i].indexOf( ":" ) + 1 )); + modDef.filterFields.put( field, value ); + } + } + else { + String field = new String(fieldPairs.substring( 0, fieldPairs.indexOf( ":" ))); + String value = new String( fieldPairs.substring( fieldPairs.indexOf( ":" ) + 1 )); + modDef.filterFields.put( field, value ); + } + + if (modifierTemplate != null) { + modDef.templateRule = new ModifierTemplateRule( modifierTemplate ); + } + } + modDef.modifierFields = new ArrayList( ); + if (modifierFields.indexOf( "," ) > 0) { + String[] fields = modifierFields.split( "," ); + for (int i = 0; i < fields.length; i++) { + modDef.modifierFields.add( fields[i] ); + } + } + else { + modDef.modifierFields.add( modifierFields ); + } + + modDef.modTokens = modDef.modifierPhrase.split( " " ); + verbModifierList.add( modDef ); + } + + + @Override + public void inform( SolrCore core ) { + if (initParams != null) { + SolrResourceLoader resourceLoader = core.getResourceLoader( ); + + synonymsFile = (String)initParams.get( "synonyms" ); + if (synonymsFile != null) { + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, tokenizer ); + } + }; + + try { + SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); + CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + parser.parse(new InputStreamReader( resourceLoader.openResource(synonymsFile), decoder)); + this.synonyms = parser.build( ); + } + catch ( Exception e ) { + // ??? 
+ Log.warn( "Parsing Synonyms Got Exception " + e ); + } + } + + String stopwordsFile = (String)initParams.get( "stopwords" ); + if (stopwordsFile != null) { + this.stopwords = new HashSet( ); + try { + BufferedReader br = new BufferedReader( new InputStreamReader( resourceLoader.openResource( stopwordsFile ))); + String line = null; + while ((line = br.readLine( )) != null) { + stopwords.add( line.toLowerCase( ) ); + } + br.close( ); + } + catch ( IOException ioe ) { + Log.warn( "Adding Stopwords Got Exception " + ioe ); + } + } + } + + core.registerFirstSearcherListener( this ); + core.registerNewSearcherListener( this ); + } + + @Override + public void postCommit() { } + + @Override + public void postSoftCommit() { } + + public void newSearcher(SolrIndexSearcher newSearcher, SolrIndexSearcher currentSearcher) { + synchronized( this ) { + initFieldMap = true; + } + } + + @Override + public void prepare( ResponseBuilder rb ) throws IOException + { + SolrQueryRequest req = rb.req; + SolrParams params = req.getParams( ); + + // Only build the field map and do the processing if we are the main event + String isShard = params.get( "isShard" ); + if (isShard != null && isShard.equals( "true" )) { + Log.debug( "A shard query: don't process!" ); + return; + } + + Log.info( "prepare ..." ); + if (initFieldMap) { + synchronized( this ) { + buildFieldMap( rb ); + initFieldMap = false; + } + } + + int mintok = 1; + String mt = params.get( MINIMUM_TOKENS ); + if ( mt != null ) { + try { + mintok = Integer.parseInt( mt ); + } + catch ( NumberFormatException nfe ) { + // ??? 
+ mintok = 1; + } + } + + String qStr = params.get( CommonParams.Q ); + Log.debug( "query is: " + qStr ); + if (qStr.equals( "*" ) || qStr.indexOf( ":" ) > 0) { + Log.debug( "Complex query - do not process" ); + return; + } + + // tokenize the query string, if any part of it matches, remove the token from the list and + // add a filter query with :value: + ArrayList queryTokens = tokenize( qStr ); + + if (queryTokens.size( ) >= mintok) { + ModifiableSolrParams modParams = new ModifiableSolrParams( params ); + if (findPattern( queryTokens, rb, modParams )) { + req.setParams( modParams ); + } + } + } + + /** + * If this method is not overridden then this will cause a request against + * the Shards causing performance degredation and duplicate values in the + * facet counts. + * Here we just return that this is done leaving it up to the Query to drive + * the requests. + * + * @param rb Ignored + * @return ResponseBuilder.STAGE_DONE + * @throws IOException never thrown + */ + @Override + public int distributedProcess(ResponseBuilder rb) throws IOException { + return ResponseBuilder.STAGE_DONE; + } + + private boolean findPattern( ArrayList queryTokens, ResponseBuilder rb, ModifiableSolrParams modParams ) throws IOException { + Log.debug( "findPattern " ); + + HashSet usedTokens = new HashSet( ); + HashMap> fieldMap = new HashMap>( ); + HashMap fieldPositionMap = new HashMap( ); + HashMap entityPositionMap = (verbModifierList != null) ? 
new HashMap() : null; + + String longestPhraseField = null; + int startToken = 0; + int lastEndToken = 0; + while ( startToken < queryTokens.size() ) { + Log.debug( "startToken = " + startToken ); + int endToken = startToken; + + while ( endToken < queryTokens.size( ) ) { + // FieldName can be comma separated if there are more than one field name for a set of tokens + String fieldName = getFieldNameFor( queryTokens, startToken, endToken ); + if ( fieldName != null ) { + longestPhraseField = fieldName; + lastEndToken = endToken; + } + else if ( longestPhraseField != null ) { + break; + } + ++endToken; + } + + if (longestPhraseField != null) { + // create matching phrase from startToken -> endToken + String phrase = getPhrase( queryTokens, startToken, lastEndToken ); + Log.debug( "get Indexed Term for " + phrase ); + String indexedTerm = getMappedFieldName( termMap, phrase.toLowerCase( )); + if (indexedTerm == null) { + indexedTerm = getMappedFieldName( termMap, getStemmed( phrase )); + } + if (indexedTerm != null) { + indexedTerm = indexedTerm.replace( '_', ' ' ); + if (indexedTerm.indexOf( " " ) > 0 ) { + indexedTerm = "\"" + indexedTerm + "\""; + } + ArrayListvalList = fieldMap.get( longestPhraseField ); + if (valList == null) { + valList = new ArrayList( ); + fieldMap.put( longestPhraseField, valList ); + } + + Log.info( "indexedTerm: " + indexedTerm ); + int[] entityPosition = null; + if (entityPositionMap != null) { + entityPosition = new int[2]; + entityPosition[0] = startToken; + entityPosition[1] = endToken-1; + } + + Log.debug( "indexedTerm: " + indexedTerm ); + if (indexedTerm.indexOf( fieldDelim ) > 0) + { + String[] indexedTerms = indexedTerm.split( fieldSplitExpr ); + for (int t = 0; t < indexedTerms.length; t++) { + valList.add( indexedTerms[t] ); + if (entityPositionMap != null) entityPositionMap.put( indexedTerms[t], entityPosition ); + } + } + else { + valList.add( indexedTerm ); + if (entityPositionMap != null) entityPositionMap.put( indexedTerm, 
entityPosition ); + } + + // save startToken and lastEndToken so can use for boolean operator context + // for multi-value fields -save the min and max of all tokens positions for the field + int[] posArray = fieldPositionMap.get( longestPhraseField ); + if (posArray == null) + { + posArray = new int[2]; + posArray[0] = startToken; + posArray[1] = lastEndToken; + fieldPositionMap.put( longestPhraseField, posArray ); + } + else + { + posArray[1] = lastEndToken; + } + + longestPhraseField = null; + for (int i = startToken; i <= lastEndToken; i++) { + Log.debug( "adding used token at " + i ); + usedTokens.add( new Integer( i ) ); + } + startToken = lastEndToken + 1; + } + } + else { + ++startToken; + } + } + + if (usedTokens.size( ) > 0) { + + // filter field maps based on verbs here: + if (entityPositionMap != null) { + filterFieldMap( queryTokens, fieldMap, entityPositionMap, fieldPositionMap ); + } + + String useBoost = modParams.get( BOOST_PARAM ); + Integer boostFactor = (useBoost != null) ? 
new Integer( useBoost ) : this.boostFactor; + if (boostFactor == null) { + StringBuilder qbuilder = new StringBuilder( ); + if (usedTokens.size( ) < queryTokens.size( ) ) { + for (int i = 0; i < queryTokens.size(); i++) { + if (boostFactor != null || usedTokens.contains( new Integer( i ) ) == false ) { + char[] token = queryTokens.get( i ); + if (qbuilder.length() > 0) qbuilder.append( " " ); + qbuilder.append( token ); + } + } + } + + Log.debug( "got qbuilder string = '" + qbuilder.toString() + "'" ); + if (qbuilder.length() == 0 && fieldMap.size() > 0) { + // build a filter query - + // EH: can't do this if dismax + Log.debug( "setting q = *:*" ); + modParams.set( "q", "*:*" ); + for (String fieldName : fieldMap.keySet() ) { + String fq = getFilterQuery( rb, fieldName, fieldMap.get( fieldName ), fieldPositionMap.get( fieldName ), queryTokens, "" ); + Log.info( "adding filter query: " + fq ); + modParams.add( "fq", fq ); + } + } + else if (qbuilder.length() > 0 && fieldMap.size() > 0) { + // build a boolean query for the fielded data, OR with remainder of query + StringBuilder boolQ = new StringBuilder( ); + for (String fieldName : fieldMap.keySet() ) { + if (boolQ.length() > 0) boolQ.append( " AND " ); + boolQ.append( getFilterQuery( rb, fieldName, fieldMap.get( fieldName ), fieldPositionMap.get( fieldName ), queryTokens, "" ) ); + } + String q = qbuilder.toString( ) + " (" + boolQ.toString() + ")"; + Log.info( "setting q = '" + q + "'" ); + modParams.set( "q", q ); + } + } + else { // boostFactor is NOT null + // use the bq field to add fielded boost clauses + StringBuilder bbuilder = new StringBuilder( ); + String boostSuffix = "^" + boostFactor.toString( ); + for (String fieldName : fieldMap.keySet( ) ) { + bbuilder.append( " " ); + bbuilder.append( getFilterQuery( rb, fieldName, fieldMap.get( fieldName ), fieldPositionMap.get( fieldName ), queryTokens, boostSuffix ) ); + } + Log.info( "adding bq = '" + bbuilder.toString() + "'" ); + modParams.add( "bq", 
bbuilder.toString( ).trim() ); + } + return true; + } + + return false; + } + + private String getPhrase( ArrayList tokens, int startToken, int endToken ) { + return getPhrase( tokens, startToken, endToken, "_" ); + } + + private String getPhrase( ArrayList tokens, int startToken, int endToken, String tokenSep ) { + StringBuilder strb = new StringBuilder( ); + for (int i = startToken; i <= endToken; i++) { + if (i > startToken) { strb.append( tokenSep ); } + + strb.append( tokens.get( i ) ); + } + Log.debug( "getPhrase returns " + strb.toString( ) ); + return strb.toString( ); + } + + private String getFilterQuery( ResponseBuilder rb, String fieldName, ArrayList valList, + int[] termPosRange, ArrayList queryTokens, String suffix) { + if (fieldName.indexOf( fieldDelim ) > 0) { + return getFilterQuery( rb, fieldName.split( fieldSplitExpr ), valList, termPosRange, queryTokens, suffix ); + } + if (valList.size() == 1) { + // check if valList[0] is multi-term - if so, check if there is a single term equivalent + // if this returns non-null, create an OR query with single term version + // example "white linen perfume" vs "white linen shirt" where "White Linen" is a brand + String term = valList.get( 0 ); + + if (term.indexOf( " " ) > 0) { + String singleTermQuery = getSingleTermQuery( term ); + if (singleTermQuery != null) { + StringBuilder strb = new StringBuilder( ); + // EH: possible meta-escaping problem if value includes {!field f=}value + strb.append( "(" ).append( fieldName ).append( ":" ) + .append( term ).append( " OR (" ).append( singleTermQuery ).append( "))" ).append( suffix ); + Log.debug( "returning composite query: " + strb.toString( ) ); + return strb.toString( ); + } + } + + String query = fieldName + ":" + term + suffix; + Log.debug( "returning single query: " + query ); + return query; + } + else { + SolrIndexSearcher searcher = rb.req.getSearcher(); + IndexSchema schema = searcher.getSchema(); + SchemaField field = schema.getField(fieldName); + 
boolean useAnd = field.multiValued() && useAndForMultiValuedFields; + // if query has 'or' in it and or is at a position 'within' the values for this field ... + if (useAnd) { + for (int i = termPosRange[0] + 1; i < termPosRange[1]; i++ ) { + char[] qToken = queryTokens.get( i ); + // is the token 'or'? + if (qToken.length == 2 && qToken[0] == 'o' && qToken[1] == 'r' ) { + useAnd = false; + break; + } + } + } + + StringBuilder orQ = new StringBuilder( ); + for (String val : valList ) { + if (orQ.length() > 0) orQ.append( (useAnd ? " AND " : " OR ") ); + orQ.append( val ); + } + return fieldName + ":(" + orQ.toString() + ")" + suffix; + } + } + + private String getFilterQuery( ResponseBuilder rb, String[] fieldNames, ArrayList valList, + int[] termPosRange, ArrayList queryTokens, String suffix) { + StringBuilder filterQBuilder = new StringBuilder( ); + for (int i = 0; i < fieldNames.length; i++) { + if (i > 0) filterQBuilder.append( " OR " ); + filterQBuilder.append( getFilterQuery( rb, fieldNames[i], valList, termPosRange, queryTokens, suffix ) ); + } + return "(" + filterQBuilder.toString() + ")"; + } + + private String getFieldNameFor( ArrayList queryTokens, int startToken, int endToken ) throws IOException { + String phrase = getPhrase( queryTokens, startToken, endToken ); + String fieldName = getFieldNameFor( phrase.toLowerCase( ) ); + if (fieldName != null) return fieldName; + + String stemmed = getStemmed( phrase ); + Log.debug( "checking stemmed " + stemmed ); + return (stemmed.equals( phrase )) ? 
null : getFieldNameFor( stemmed ); + } + + private String getSingleTermQuery( String multiTermValue ) { + + String multiTerm = multiTermValue; + if (multiTermValue.startsWith( "\"" )) { + multiTerm = new String( multiTermValue.substring( 1, multiTermValue.lastIndexOf( "\"" ))); + } + Log.debug( "getSingleTermQuery " + multiTerm + "" ); + + try { + StringBuilder strb = new StringBuilder( ); + + String[] terms = multiTerm.split( " " ); + for (int i = 0; i < terms.length; i++) { + if (i > 0) strb.append( " AND " ); + + String fieldName = getFieldNameFor( terms[i].toLowerCase( ) ); + Log.debug( "fieldName for " + terms[i].toLowerCase( ) + " is " + fieldName ); + if (fieldName == null) return null; + + if (fieldName.indexOf( fieldDelim ) > 0) { + String[] fields = fieldName.split( fieldSplitExpr ); + strb.append( "(" ); + for (int f = 0; f < fields.length; f++) { + if (f > 0) strb.append( " OR " ); + strb.append( fields[f] ).append( ":" ).append( getMappedFieldName( termMap, terms[i].toLowerCase( ) ) ); + } + strb.append( ")" ); + } + else { + strb.append( fieldName ).append( ":" ).append( getMappedFieldName( termMap, terms[i].toLowerCase( ) ) ); + } + } + + Log.debug( "getSingleTermQuery returns: '" + strb.toString( ) + "'" ); + return strb.toString( ); + } + catch (IOException ioe ) { + return null; + } + } + + private String getFieldNameFor( String phrase ) throws IOException { + Log.debug( "getFieldNameFor '" + phrase + "'" ); + return ("*".equals( phrase) || "* *".equals( phrase )) ? 
null : getMappedFieldName( fieldMap, phrase.toLowerCase( ) ); + } + + + // TODO: Return comma separated string if more than one + private String getMappedFieldName( SynonymMap termMap, String phrase ) throws IOException { + Log.debug( "getMappedFieldName: '" + phrase + "'" ); + FST fst = termMap.fst; + if(fst != null) { + FST.BytesReader fstReader = fst.getBytesReader(); + FST.Arc scratchArc = new FST.Arc<>(); + BytesRef scratchBytes = new BytesRef(); + CharsRefBuilder scratchChars = new CharsRefBuilder(); + ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc(scratchArc); + BytesRef matchOutput = null; + + String noSpPhrase = phrase.replace(' ', '_'); + int charPos = 0; + while (charPos < noSpPhrase.length()) { + final int codePoint = noSpPhrase.codePointAt(charPos); + if (fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader) == null) { + Log.debug("No FieldName for " + phrase); + return null; + } + + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + charPos += Character.charCount(codePoint); + } + + if (scratchArc.isFinal()) { + Log.debug("creating matchOutput"); + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + ArrayList mappedFields = new ArrayList(); + bytesReader.reset(matchOutput.bytes, matchOutput.offset, matchOutput.length); + + final int code = bytesReader.readVInt(); + final int count = code >>> 1; + for (int outputIDX = 0; outputIDX < count; outputIDX++) { + termMap.words.get(bytesReader.readVInt(), scratchBytes); + scratchChars.copyUTF8Bytes(scratchBytes); + int lastStart = 0; + final int chEnd = lastStart + scratchChars.length(); + for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) { + if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) { + int outputLen = chIDX - lastStart; + assert outputLen > 0 : "output contains empty string: " + scratchChars; + mappedFields.add(new 
String(scratchChars.chars(), lastStart, outputLen)); + lastStart = chIDX + 1; + } + } + } + + if (mappedFields.size() == 1) { + Log.debug("returning mapped fieldName " + mappedFields.get(0)); + return mappedFields.get(0); + } else { + StringBuilder fieldBuilder = new StringBuilder(); + for (String fieldName : mappedFields) { + if (fieldBuilder.length() > 0) fieldBuilder.append(fieldDelim); + fieldBuilder.append(fieldName); + } + Log.debug("returning mapped fieldName " + fieldBuilder.toString()); + return fieldBuilder.toString(); + } + } + } else { + Log.debug("Finite State Machine is null on Synonym Map -> ignored"); + } + + // Surpressing this message since it is very chatty in production. + Log.debug( "matchOutput but no FieldName for " + phrase ); + return null; + } + + + private void buildFieldMap( ResponseBuilder rb ) throws IOException { + Log.debug( "buildFieldMap" ); + SolrIndexSearcher searcher = rb.req.getSearcher(); + // build a synonym map from the SortedDocValues - + // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue + SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true ); + SynonymMap.Builder termBuilder = new SynonymMap.Builder( true ); + + HashMap fieldTypeMap = new HashMap( ); + + ArrayList searchFields = getStringFields( searcher ); + for (String searchField : searchFields ) { + fieldTypeMap.put( searchField, UninvertingReader.Type.SORTED_SET_BINARY); + } + UninvertingReader unvRead = new UninvertingReader( searcher.getLeafReader( ), fieldTypeMap ); + + for (String searchField : searchFields ) { + Log.debug( "adding searchField " + searchField ); + CharsRef fieldChars = new CharsRef( searchField ); + SortedSetDocValues sdv = unvRead.getSortedSetDocValues( searchField ); + if (sdv == null) continue; + Log.debug( "got SortedSetDocValues for " + searchField ); + TermsEnum te = sdv.termsEnum(); + while (te.next() != null) { + BytesRef term = te.term(); + String fieldValue = term.utf8ToString( 
); + if (StringUtils.isNotEmpty(fieldValue)) { + addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder ); + } + } + } + + addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields ); + + fieldMap = fieldBuilder.build( ); + termMap = termBuilder.build( ); + } + + // TODO: Filter this by the configuration fields ... + private ArrayList getStringFields( SolrIndexSearcher searcher ) { + + ArrayList strFields = new ArrayList( ); + + if ( hasWhitelist() ) { + Log.info("Using whitelist fields instead of schema."); + for ( String fieldName: whitelistFields ) { + strFields.add( fieldName ); + } + } else { + IndexSchema schema = searcher.getSchema(); + Iterable fieldNames = searcher.getFieldNames(); + Iterator fnIt = fieldNames.iterator(); + + while ( fnIt.hasNext() ) { + String fieldName = fnIt.next( ); + if (excludeFields == null || !excludeFields.contains( fieldName )) { + SchemaField field = schema.getField(fieldName); + if (field.stored() && field.getType() instanceof StrField ) { + strFields.add( fieldName ); + } + } + } + } + + return strFields; + } + + private boolean hasWhitelist() { + return this.whitelistFields != null && this.whitelistFields.size() > 0; + } + + private void addTerm( CharsRef fieldChars, String fieldValue, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder ) throws IOException { + + Log.debug( "got fieldValue: '" + fieldValue + "'" ); + String nospVal = fieldValue.replace( ' ', '_' ); + Log.debug( "got nspace: '" + nospVal + "'" ); + CharsRef nospChars = new CharsRef( nospVal ); + CharsRef valueChars = new CharsRef( fieldValue ); + + fieldBuilder.add( nospChars, fieldChars, false ); + termBuilder.add( nospChars, valueChars, false ); + + // lower case term, + String lowercase = nospVal.toLowerCase( ); + CharsRef lcChars = new CharsRef( lowercase ); + fieldBuilder.add( lcChars, fieldChars, false ); + termBuilder.add( lcChars, valueChars, false ); + + // stem it + String stemmed = getStemmed( nospVal ); + if (stemmed.equals( 
fieldValue ) == false) { + Log.debug( "adding stemmed: " + stemmed ); + CharsRef stChars = new CharsRef( stemmed ); + fieldBuilder.add( stChars, fieldChars, false ); + termBuilder.add( stChars, valueChars, false ); + } + + if (this.synonyms != null) { + // get synonyms from synonyms.txt + ArrayList synonymLst = getSynonymsFor( this.synonyms, fieldValue ); + if ( synonymLst != null ) { + for (String synonym : synonymLst ) { + String nospSyn = synonym.replace( ' ', '_' ); + Log.debug( "adding: " + synonym + " -> " + fieldValue ); + CharsRef synChars = new CharsRef( nospSyn ); + fieldBuilder.add( synChars, fieldChars, false ); + termBuilder.add( synChars, valueChars, false ); + } + } + synonymLst = getSynonymsFor( this.synonyms, fieldValue.toLowerCase() ); + if ( synonymLst != null ) { + for (String synonym : synonymLst ) { + String nospSyn = synonym.replace( ' ', '_' ); + Log.debug( "adding: " + synonym + " -> " + fieldValue ); + CharsRef synChars = new CharsRef( nospSyn ); + fieldBuilder.add( synChars, fieldChars, false ); + termBuilder.add( synChars, valueChars, false ); + } + } + } + } + + private void addDistributedTerms( ResponseBuilder rb, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList searchFields ) throws IOException { + SolrIndexSearcher searcher = rb.req.getSearcher(); + CoreContainer container = searcher.getCore().getCoreDescriptor().getCoreContainer(); + + ShardHandlerFactory shardHandlerFactory = container.getShardHandlerFactory( ); + ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); + + final SolrParams distribParams = rb.req.getParams(); + final boolean isDistrib = distribParams.get(ShardParams.SHARDS) != null; + Log.debug( "Is Distributed = " + isDistrib ); + + if( isDistrib ) { + shardHandler.prepDistributed( rb ); + // create a ShardRequest that contains a Terms Request. + // don't send to this shard??? 
+ ShardRequest sreq = new ShardRequest(); + sreq.purpose = ShardRequest.PURPOSE_GET_TERMS; + sreq.actualShards = rb.shards; + ModifiableSolrParams params = new ModifiableSolrParams( ); + + params.set( TermsParams.TERMS_LIMIT, -1); + params.set( TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX); + String[] fields = searchFields.toArray( new String[ searchFields.size( )] ); + params.set( TermsParams.TERMS_FIELD, fields ); + + params.set( CommonParams.DISTRIB, "false" ); + params.set( ShardParams.IS_SHARD, true ); + params.set( ShardParams.SHARDS_PURPOSE, sreq.purpose ); + params.set( CommonParams.QT, termsHandler ); + params.set( TermsParams.TERMS, "true" ); + + if (rb.requestInfo != null) { + params.set("NOW", Long.toString(rb.requestInfo.getNOW().getTime())); + } + sreq.params = params; + + for (String shard : rb.shards ) { + Log.debug( "sending request to shard " + shard ); + params.set(ShardParams.SHARD_URL, shard ); + shardHandler.submit( sreq, shard, params ); + } + + ShardResponse rsp = shardHandler.takeCompletedIncludingErrors( ); + if (rsp != null) { + Log.debug( "got " + rsp.getShardRequest().responses.size( ) + " responses" ); + for ( ShardResponse srsp : rsp.getShardRequest().responses ) { + Log.debug( "Got terms response from " + srsp.getShard( )); + + if (srsp.getException() != null) { + Log.debug( "ShardResponse Exception!! " + srsp.getException( ) ); + } + + @SuppressWarnings("unchecked") + NamedList> terms = (NamedList>) srsp.getSolrResponse().getResponse().get("terms"); + if (terms != null) { + addTerms( terms, fieldBuilder, termBuilder, searchFields ); + } + else { + Log.warn( "terms was NULL! 
- make sure that /terms request handler is defined in solrconfig.xml" ); + } + } + } + } + } + + private void addTerms( NamedList> terms, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList searchFields ) throws IOException { + TermsResponse termsResponse = new TermsResponse( terms ); + for (String fieldName : searchFields ) { + CharsRef fieldChars = new CharsRef( fieldName ); + List termList = termsResponse.getTerms( fieldName ); + if (termList != null) { + for (TermsResponse.Term tc : termList) { + String term = tc.getTerm(); + Log.debug( "Add distributed term: " + fieldName + " = " + term ); + addTerm( fieldChars, term, fieldBuilder, termBuilder ); + } + } + } + } + + + private ArrayList getSynonymsFor( SynonymMap synMap, String term ) throws IOException { + Log.debug( "getSynonymsFor '" + term + "'" ); + + FST fst = synMap.fst; + FST.BytesReader fstReader = fst.getBytesReader(); + FST.Arc scratchArc = new FST.Arc<>( ); + BytesRef scratchBytes = new BytesRef(); + CharsRefBuilder scratchChars = new CharsRefBuilder(); + ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc( scratchArc ); + BytesRef matchOutput = null; + + String[] tokens = term.split( " " ); + for (int i = 0; i < tokens.length; i++) { + + int charPos = 0; + while( charPos < tokens[i].length() ) { + final int codePoint = tokens[i].codePointAt( charPos ); + if (fst.findTargetArc( codePoint, scratchArc, scratchArc, fstReader) == null) { + Log.debug( "No Synonym for " + term ); + return null; + } + + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + charPos += Character.charCount(codePoint); + } + + if (scratchArc.isFinal()) { + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + } + + if (i < tokens.length-1 && fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) != null) { + pendingOutput = fst.outputs.add(pendingOutput, 
scratchArc.nextFinalOutput); + } + } + + if (matchOutput != null) { + ArrayList synonymLst = new ArrayList( ); + bytesReader.reset( matchOutput.bytes, matchOutput.offset, matchOutput.length ); + + final int code = bytesReader.readVInt(); + final int count = code >>> 1; + for( int outputIDX = 0; outputIDX < count; outputIDX++ ) { + synMap.words.get( bytesReader.readVInt(), scratchBytes); + scratchChars.copyUTF8Bytes(scratchBytes); + int lastStart = 0; + final int chEnd = lastStart + scratchChars.length(); + for( int chIDX = lastStart; chIDX <= chEnd; chIDX++ ) { + if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) { + int outputLen = chIDX - lastStart; + assert outputLen > 0: "output contains empty string: " + scratchChars; + String synonym = new String( scratchChars.chars(), lastStart, outputLen ); + Log.debug( "got synonym '" + synonym + "'" ); + synonymLst.add( synonym ); + lastStart = chIDX + 1; + } + } + } + + return synonymLst; + } + + return null; + } + + + // assume English for now ... 
+ private String getStemmed( String input ) { + char[] inputChars = input.toCharArray( ); + + int lastCh = stem( inputChars, inputChars.length ); + if (lastCh < inputChars.length) { + return new String( inputChars, 0, lastCh ); + } + + return input; + } + + // similar to EnglishMinimalStemmer - fixes "...hes" as in batches couches + public int stem(char s[], int len) { + if (len < 3 || s[len-1] != 's') + return len; + + switch(s[len-2]) { + case 'u': + case 's': return len; + case 'e': + if (len > 3 && s[len-3] == 'i' && s[len-4] != 'a' && s[len-4] != 'e') { + s[len - 3] = 'y'; + return len - 2; + } + if (len > 3 && s[len-3] == 'h') { + return len-2; + } + if (s[len-3] == 'i' || s[len-3] == 'a' || s[len-3] == 'o' || s[len-3] == 'e') + return len; /* intentional fallthrough */ + default: return len - 1; + } + } + + private ArrayList tokenize( String input ) throws IOException { + + Log.debug( "tokenize '" + input + "'" ); + ArrayList tokens = new ArrayList( ); + Tokenizer tk = getTokenizerImpl( input ); + + CharTermAttribute term = tk.addAttribute( CharTermAttribute.class ); + tk.reset( ); + while (tk.incrementToken( ) ) { + int bufLen = term.length(); + char[] copy = new char[ bufLen ]; + System.arraycopy(term.buffer( ), 0, copy, 0, bufLen ); + tokens.add( copy ); + } + + return tokens; + } + + private Tokenizer getTokenizerImpl( String input ) throws IOException { + StandardTokenizer sttk = new StandardTokenizer( ); + sttk.setReader( new StringReader( input ) ); + return sttk; + } + + @Override + public void process(ResponseBuilder rb) throws IOException + { + // do nothing - needed so we don't execute the query here. 
+ } + + // =========================================================================== + // Verb Modifier Code + // Using the verb modifier map if a verb modifier is adjacent to a field mapped phrase (can have noise words between) + // restrict the field names in the list to the one that is linked to the verb modifier + // TODO - how to deal with 'and' and 'or' Between modifiers + // =========================================================================== + private void filterFieldMap( ArrayList queryTokens, HashMap> fieldMap, + HashMap entityPositionMap, HashMap fieldPositionMap ) { + + Log.info( "filterFieldMap" ); + // need to find the modifiers that are in THIS set of tokens by position, in the order used ... + ArrayList usedModifiers = getOrderedModifierPositions( queryTokens ); + if (usedModifiers == null || usedModifiers.size() == 0) { + return; // nothing to do ... + } + + // find the verb modifiers in the query tokens list + // need to keep track of 'next entity' and 'last entity' as we iterate + boolean remapped = false; + for (ModifierInstance modInstance : usedModifiers) { + if (modInstance.templateRule != null) applyModifierTemplateRule( entityPositionMap, fieldMap, modInstance.templateRule ); + + HashMap fieldNameKeys = getFieldKeysForFieldName( modInstance.modifierFields, fieldMap ); + if (fieldNameKeys != null) { + // find the entity just before (maximum pos before) or after (minimum pos after) the modifier phrase from entityPositionMap + // assumming here that the modifiers can work bi-directionally + // as in 'songs Paul McCartney composed' or 'songs Paul McCartney has written' vs. 'songs composed by Paul McCartney' + // or 'Bands Paul McCartney was in' vs. 
'who was in the Who' + for (String fieldNameKey : fieldNameKeys.keySet() ) { + String modifierField = fieldNameKeys.get( fieldNameKey ); + + HashSet entityPhrases = findLastEntitiesBefore( entityPositionMap, modInstance, usedModifiers, fieldMap.get( fieldNameKey ) ); + if ( entityPhrases != null ) { + remapEntity( fieldNameKey, entityPhrases, modifierField, fieldMap, fieldPositionMap, entityPositionMap ); + remapped = true; + } + else { + entityPhrases = findFirstEntitiesAfter( entityPositionMap, modInstance, usedModifiers, fieldMap.get( fieldNameKey ) ); + if (entityPhrases != null) { + remapEntity( fieldNameKey, entityPhrases, modifierField, fieldMap, fieldPositionMap, entityPositionMap ); + remapped = true; + } + } + } + } + + // add any filter fields for the verbs: + if (remapped && modInstance.filterFields != null) { + Log.info( "checking verb modifiers for " + modInstance.modifierFields ); + for (String filtField : modInstance.filterFields.keySet( ) ) { + ArrayList valList = new ArrayList( ); + valList.add( modInstance.filterFields.get( filtField ) ); + Log.info( "setting verb filter: " + filtField + ":" + modInstance.filterFields.get( filtField ) ); + fieldMap.put( filtField, valList ); + fieldPositionMap.put( filtField, modInstance.modifierPos ); + } + } + } + } + + private ArrayList getOrderedModifierPositions( ArrayList queryTokens ) { + ArrayList modifiers = null; + int i = 0; + while (i < queryTokens.size( ) ) { + char[] token = queryTokens.get( i ); + ModifierDefinition modifier = findModifier( token ); + if (modifier != null && matchesModifier( modifier.modTokens, queryTokens, i )) { + Log.info( "Adding Modifier Instance '" + modifier.modifierPhrase + "'" ); + ModifierInstance modInst = new ModifierInstance( ); + modInst.modifierPhrase = modifier.modifierPhrase; + modInst.modifierFields = modifier.modifierFields; + Log.info( "fields: " ); + for (String modField : modifier.modifierFields ) { Log.info( " " + modField ); } + modInst.modifierPos = new 
int[2]; + modInst.modifierPos[0] = i; + modInst.modifierPos[1] = i + modifier.modTokens.length - 1; + + modInst.filterFields = modifier.filterFields; + modInst.templateRule = modifier.templateRule; + if (modifiers == null) modifiers = new ArrayList( ); + modifiers.add( modInst ); + i += modifier.modTokens.length; + } + else { + ++i; + } + } + + return modifiers; + } + + private ModifierDefinition findModifier( char[] queryToken ) { + for (ModifierDefinition modifier : verbModifierList ) { + if (modifier.modifierPhrase.startsWith( new String( queryToken ) )) { + return modifier; + } + } + return null; + } + + private boolean matchesModifier( String[] modTokens, ArrayList queryTokens, int start ) { + int i = 0; + while ( (start + i) < queryTokens.size( ) && i < modTokens.length ) { + String token = new String( queryTokens.get( start + i ) ); + if (!token.toLowerCase( ).equals( modTokens[i].toLowerCase( ))) return false; + if (++i == modTokens.length) return true; + } + return false; + } + + + private HashMap getFieldKeysForFieldName( ArrayList modifierFields, HashMap> fieldMap ) { + Log.info( "getFieldKeysForFieldName" ); + HashMap fieldKeys = null; + for (String modifierField : modifierFields ) { + Log.info( "testing modifierField: " + modifierField ); + for (String fieldNameList : fieldMap.keySet() ) { + Log.info( "testing fieldNameList: " + fieldNameList ); + String[] fields = fieldNameList.split( fieldSplitExpr ); + for (int i = 0; i < fields.length; i++) { + if ( fields[i].equals( modifierField )) { + if (fieldKeys == null) fieldKeys = new HashMap( ); + Log.info( "adding field Key " + fieldNameList + ": " + modifierField ); + fieldKeys.put( fieldNameList, modifierField ); + } + } + } + } + return fieldKeys; + } + + + + // find entities before the current mod pos but after the last one (if modPos is not first in the list of modifier positions) + // we also need to keep track of the operator (???) 
+ private HashSet findLastEntitiesBefore( HashMap entityPositionMap, ModifierInstance modifier, + ArrayList usedModifiers, ArrayList fieldVals ) { + Log.info( "findLastEntitiesBefore" ); + HashSet entitySet = null; + int previousModifierPosition = -1; + int thisModPos = modifier.modifierPos[0]; + + for ( ModifierInstance mod : usedModifiers ) { + if (mod.modifierPos[1] < thisModPos ) { + previousModifierPosition = mod.modifierPos[1]; + break; + } + } + + for (String entityPhrase : entityPositionMap.keySet( ) ) { + Log.info( " testing " + entityPhrase ); + if (fieldVals.contains( entityPhrase)) { + int[] entityPos = entityPositionMap.get( entityPhrase ); + Log.info( "entity is at " + entityPos[0] + "," + entityPos[1] ); + Log.info( "mod is at " + thisModPos + " previous mod was " + previousModifierPosition ); + if (entityPos[1] < thisModPos && entityPos[0] > previousModifierPosition ) { + if (entitySet == null) entitySet = new HashSet( ); + Log.info( "adding " + entityPhrase ); + entitySet.add( entityPhrase ); + } + } + } + + return entitySet; + } + + // find entities after the current mod pos but before the next modifier + private HashSet findFirstEntitiesAfter( HashMap entityPositionMap, ModifierInstance modifier, + ArrayList usedModifiers, ArrayList fieldVals ) { + Log.info( "findFirstEntitiesAfter" ); + HashSet entitySet = null; + int nextModifierPosition = Integer.MAX_VALUE; + int thisModPos = modifier.modifierPos[1]; + + for (ModifierInstance mod : usedModifiers ) { + if (mod.modifierPos[0] > thisModPos ) { + nextModifierPosition = mod.modifierPos[0]; + break; + } + } + + for (String entityPhrase : entityPositionMap.keySet( ) ) { + Log.info( " testing " + entityPhrase ); + if (fieldVals.contains( entityPhrase)) { + int[] entityPos = entityPositionMap.get( entityPhrase ); + Log.info( "entity is at " + entityPos[0] + "," + entityPos[1] ); + Log.info( "mod is at " + thisModPos + " next mod is " + nextModifierPosition ); + if (entityPos[0] > thisModPos && 
entityPos[1] < nextModifierPosition ) { + if (entitySet == null) entitySet = new HashSet( ); + Log.info( "adding " + entityPhrase ); + entitySet.add( entityPhrase ); + } + } + } + + return entitySet; + } + + + private void remapEntity( String fieldNameKey, HashSet entityValues, String modifierField, + HashMap> fieldMap, HashMap fieldPositionMap, HashMap entityPositionMap ) { + // find the fieldMap key that contains the fieldName + ArrayList fieldVals = fieldMap.get( fieldNameKey ); + + boolean allMatch = true; + for (String fieldVal : fieldVals ) { + if (!entityValues.contains( fieldVal )) { + allMatch = false; + break; + } + } + + // if the field values in the fieldMap match the set of entity values -- remove the fieldNameKey and replace it with the modifierField in the map + if ( allMatch ) { + if (fieldNameKey.equals( modifierField )) return; + + fieldMap.remove( fieldNameKey ); + Log.info( "remapping: " + modifierField ); + for( String val : fieldVals ) { Log.info( " " + val ); } + fieldMap.put( modifierField, fieldVals ); + } + else { + // for a partial map - remove the field values in the fieldMap that are in the entityValues set, and create a new entry with modifierField => entityValues + ArrayList remaining = new ArrayList( ); + ArrayList modList = new ArrayList( ); + for (String fieldVal : fieldVals ) { + if (entityValues.contains( fieldVal )) { + modList.add( fieldVal ); + } + else { + remaining.add( fieldVal ); + } + } + + fieldMap.put( modifierField, modList ); + fieldPositionMap.put( modifierField, getPosArrayFor( modList, entityPositionMap ) ); + + fieldMap.put( fieldNameKey, remaining ); + fieldPositionMap.put( fieldNameKey, getPosArrayFor( remaining, entityPositionMap ) ); + } + } + + private void applyModifierTemplateRule( HashMap entityPositionMap, HashMap> fieldMap, ModifierTemplateRule modifierRule ) { + Log.info( "applyModifierTemplateRule" ); + // find entity_1_field - from field map - find entityPosition from values + ArrayList 
firstEntityList = findEntityList( fieldMap, modifierRule.entity_1_field ); + if (firstEntityList == null) return; + String firstFieldList = null; + String entityValue = null; + + for (String firstEntity : firstEntityList ) { + Log.info( "checking entity: " + firstEntity ); + int[] firstPos = entityPositionMap.get( firstEntity ); + int[] secondPos = entityPositionMap.get( modifierRule.entity_2_value ); + if (secondPos != null && (secondPos[0] == firstPos[1] + 1) && findEntityList( fieldMap, modifierRule.entity_2_field ) != null ) { + if (modifierRule.entity_1_value.equals( "_ENTITY_" )) { + Log.info( "'" + firstEntity + "' matches pattern" ); + entityValue = firstEntity; + ArrayList outputList = new ArrayList( ); + outputList.add( firstEntity ); + firstFieldList = findFieldList( fieldMap, modifierRule.entity_1_field ); + fieldMap.put( modifierRule.output_field, outputList ); + break; + } + } + } + + if ( firstFieldList != null ) { + // remove remapped entity field from field list + Log.info( "removing " + modifierRule.entity_1_field + " from " + firstFieldList ); + String[] fields = firstFieldList.split( "\\|" ); + StringBuilder stb = new StringBuilder( ); + for (int i = 0; i < fields.length; i++) { + if (fields[i].equals( modifierRule.entity_1_field) == false ) { + if (stb.length() > 0) stb.append( "," ); + stb.append( fields[i] ); + } + } + + // remove entityValue from fieldMap arrayList + if (stb.length() > 0) { + Log.info( "new field list: " + stb.toString( ) ); + ArrayList remainder = new ArrayList( ); + for (String firstEntity : firstEntityList ) { + if (firstEntity.equals( entityValue ) == false ) { + Log.info( "adding remaining value " + firstEntity ); + remainder.add( firstEntity ); + } + } + if (remainder.size( ) > 0) { + Log.info( "remainder fields: " + stb.toString( ) ); + fieldMap.put( stb.toString( ), remainder ); + } + + Log.info( "removing field: " + firstFieldList ); + fieldMap.remove( firstFieldList ); + } + } + } + + private ArrayList 
findEntityList( HashMap> fieldMap, String entityField ) { + for (String fieldList : fieldMap.keySet() ) { + if (fieldList.contains( entityField )) { + return fieldMap.get( fieldList ); + } + } + return null; + } + + private String findFieldList( HashMap> fieldMap, String entityField ) { + for (String fieldList : fieldMap.keySet() ) { + if (fieldList.contains( entityField )) { + return fieldList; + } + } + return null; + } + + private int[] getPosArrayFor( ArrayList entities, HashMap entityPositionMap ) { + int[] newPosArray = null; + for ( String entity : entities ) { + int[] entityPos = entityPositionMap.get( entity ); + if (entityPos != null) { + if (newPosArray == null) newPosArray = entityPos; + else { + if (entityPos[1] < newPosArray[0] ) { + newPosArray[0] = entityPos[0]; + } + if (entityPos[0] > newPosArray[1] ) { + newPosArray[1] = entityPos[1]; + } + } + } + } + + return newPosArray; + } + + private class ModifierDefinition + { + String modifierPhrase; // the phrase that will modify like 'was in' + ArrayList modifierFields; // the field(s) that will be used like 'memberOfGroup_ss,groupMembers_ss' + String[] modTokens; + HashMap filterFields; + ModifierTemplateRule templateRule; + } + + private class ModifierInstance + { + String modifierPhrase; + ArrayList modifierFields; + int[] modifierPos; + HashMap filterFields; + ModifierTemplateRule templateRule; + } + + // original_performer_s:_ENTITY_,recording_type_ss:Song=>original_performer_s:_ENTITY_ + private class ModifierTemplateRule + { + String entity_1_field; + String entity_1_value; + + String entity_2_field; + String entity_2_value; + + String output_field; + String output_value; + + ModifierTemplateRule( String templatePattern ) { + String leftSide = new String(templatePattern.substring( 0, templatePattern.indexOf( "=>" ))); + String rightSide = new String(templatePattern.substring( templatePattern.indexOf( "=>" ) + 2 )); + + String entity_1 = new String( leftSide.substring( 0, leftSide.indexOf( "," 
))); + String entity_2 = new String( leftSide.substring( leftSide.indexOf( "," ) + 1 )); + + entity_1_field = new String( entity_1.substring( 0, entity_1.indexOf( ":" ))); + entity_1_value = new String( entity_1.substring( entity_1.indexOf( ":" ) + 1 )); + entity_2_field = new String( entity_2.substring( 0, entity_2.indexOf( ":" ))); + entity_2_value = new String( entity_2.substring( entity_2.indexOf( ":" ) + 1 )); + + output_field = new String( rightSide.substring( 0, rightSide.indexOf( ":" ))); + output_value = new String( rightSide.substring( rightSide.indexOf( ":" ) + 1 )); + + Log.info( "entity_1_field: " + entity_1_field + " entity_1_value: " + entity_1_value ); + Log.info( "entity_2_field: " + entity_2_field + " entity_2_value: " + entity_2_value ); + Log.info( "output_field: " + output_field + " output_value: " + output_value ); + } + } + +} diff --git a/solr6.x/src/test/com/lucidworks/solr/handler/component/DistributedQueryAutoFilteringTest.java b/solr6.x/src/test/com/lucidworks/solr/handler/component/DistributedQueryAutoFilteringTest.java new file mode 100644 index 0000000..1cbba6a --- /dev/null +++ b/solr6.x/src/test/com/lucidworks/solr/handler/component/DistributedQueryAutoFilteringTest.java @@ -0,0 +1,75 @@ +package org.apache.solr.handler.component; + +import org.apache.solr.BaseDistributedSearchTestCase; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.junit.BeforeClass; +import org.junit.Test; + + +public class DistributedQueryAutoFilteringTest extends BaseDistributedSearchTestCase { + + public DistributedQueryAutoFilteringTest() { + stress = 0; + } + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + initCore( "solrconfig-autofilter.xml", "schema-autofilter.xml" ); + } + + @Test + @ShardsFixed(num = 3) + public void test() throws Exception { + del("*:*"); + + index( id, "1", "color", "red", 
"product", "shoes" ); + index( id, "2", "color", "red", "product", "socks" ); + index( id, "3", "color", "brown", "product", "socks" ); + index( id, "4", "color", "green", "brand", "red lion", "product", "socks" ); + index( id, "5", "color", "blue", "brand", "red lion", "product", "socks" ); + index( id, "6", "color", "blue", "brand", "red dragon", "product", "socks" ); + index( id, "7", "brand", "red baron", "product", "pizza" ); + index( id, "8", "brand", "red label", "product", "whiskey" ); + index( id, "9", "brand", "red light", "product", "smoke detector" ); + index( id, "10", "brand", "red star", "product", "yeast" ); + index( id, "11", "brand", "gallo", "product", "red wine" ); + index( id, "12", "brand", "heinz", "product", "red wine vinegar" ); + index( id, "13", "brand", "dole", "product", "red grapes" ); + index( id, "14", "brand", "acme", "product", "red brick" ); + commit(); + + handle.put("distrib", SKIP); + handle.put("shards", SKIP); + + QueryResponse rsp; + rsp = query( CommonParams.Q, "red lion socks", "fl", "id", "rows", 20, "qt", "/select", "sort", "id asc" ); + assertFieldValues(rsp.getResults(), id, "1", "10", "11", "12", "13", "14", "2", "3", "4", "5", "6", "7", "8", "9" ); + + rsp = query( CommonParams.Q, "red lion socks", "fl", "id", "qt", "/autofilter", "sort", "id asc" ); + assertFieldValues(rsp.getResults(), id, "4", "5" ); + + rsp = query( CommonParams.Q, "blue red lion socks", "fl", "id", "qt", "/autofilter" ); + assertFieldValues(rsp.getResults(), id, "5" ); + + rsp = query( CommonParams.Q, "red wine", "fl", "id", "qt", "/autofilter" ); + assertFieldValues(rsp.getResults(), id, "11" ); + + rsp = query( CommonParams.Q, "red wine vinegar", "fl", "id", "qt", "/autofilter" ); + assertFieldValues(rsp.getResults(), id, "12" ); + } + + @Override + protected QueryResponse query(Object... 
q) throws Exception { + + final ModifiableSolrParams params = new ModifiableSolrParams(); + + for (int i = 0; i < q.length; i += 2) { + params.add(q[i].toString(), q[i + 1].toString()); + } + params.set("shards", getShardsString()); + + return queryServer(params); + } +} diff --git a/solr6.x/src/test/com/lucidworks/solr/handler/component/QueryAutoFilteringComponentTest.java b/solr6.x/src/test/com/lucidworks/solr/handler/component/QueryAutoFilteringComponentTest.java new file mode 100644 index 0000000..e5d3db4 --- /dev/null +++ b/solr6.x/src/test/com/lucidworks/solr/handler/component/QueryAutoFilteringComponentTest.java @@ -0,0 +1,486 @@ +package org.apache.solr.handler.component; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +public class QueryAutoFilteringComponentTest extends SolrTestCaseJ4 { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-autofilter.xml","schema-autofilter.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + } + + @Test + public void testColors( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "Red", "product", "socks" )); + assertU(adoc("id", "3", "color", "brown", "product", "socks" )); + assertU(adoc("id", "4", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "5", "color", "blue", "brand", "green dragon", "product", "socks" )); + assertU(adoc("id", "6", "color", "black", "brand", "buster brown", "product", "shoes" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red lion socks", CommonParams.QT, "/select" ) + , "//*[@numFound='5']" + , "//doc[./str[@name='id']='4']" + , "//doc[./str[@name='id']='2']" + , 
"//doc[./str[@name='id']='1']" + , "//doc[./str[@name='id']='3']" + , "//doc[./str[@name='id']='5']"); + + assertQ("", req(CommonParams.Q, "red socks", CommonParams.QT, "/select" ) + , "//*[@numFound='5']" + , "//doc[./str[@name='id']='2']" + , "//doc[./str[@name='id']='4']" + , "//doc[./str[@name='id']='1']" + , "//doc[./str[@name='id']='3']" + , "//doc[./str[@name='id']='5']"); + + assertQ("", req(CommonParams.Q, "red lion socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']" ); + + assertQ("", req(CommonParams.Q, "red socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='2']" ); + + assertQ("", req(CommonParams.Q, "brown shoes", CommonParams.QT, "/select" ) + , "//*[@numFound='3']" + , "//doc[./str[@name='id']='1']" + , "//doc[./str[@name='id']='3']" + , "//doc[./str[@name='id']='6']"); + + assertQ("", req(CommonParams.Q, "brown shoes", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='0']" ); + + } + + @Test + public void testSynonyms( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "chaise lounge" )); + assertU(adoc("id", "2", "color", "red", "product", "sofa" )); + assertU(adoc("id", "3", "color", "red", "product", "chair" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red couch", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='2']" ); + + assertQ("", req(CommonParams.Q, "rouge sofa", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='2']" ); + + assertQ("", req(CommonParams.Q, "red lounge chair", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "rouge lounge chair", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "crimson day bed", CommonParams.QT, "/autofilter" ) + , 
"//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + } + + @Test + public void testCaseInsensitive( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "brown", "product", "socks" )); + assertU(adoc("id", "4", "color", "green", "brand", "Red Lion", "product", "socks")); + assertU(adoc("id", "5", "color", "blue", "brand", "Green Dragon", "product", "socks" )); + assertU(adoc("id", "6", "color", "black", "brand", "Buster Brown", "product", "shoes" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red lion socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']" ); + + assertQ("", req(CommonParams.Q, "Red Lion socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']" ); + } + + @Test + public void testSynonymsCaseInsensitive( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "Chaise Lounge" )); + assertU(adoc("id", "2", "color", "red", "product", "sofa" )); + assertU(adoc("id", "3", "color", "red", "product", "chair" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red lounge chair", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "scarlet Lounge Chair", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "Crimson Couch", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='2']" ); + + } + + + @Test + public void testStemming( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shirt" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "red", "product", "pants" )); + 
assertU(adoc("id", "4", "color", "red", "product", "sofa" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red shirts", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "red shirt", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + assertQ("", req(CommonParams.Q, "red couches", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']" ); + } + + @Test + public void testMinTokens( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "brand", "red label", "product", "whiskey")); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='2']" + , "//doc[./str[@name='id']='1']" + , "//doc[./str[@name='id']='2']" ); + + assertQ("", req(CommonParams.Q, "red", CommonParams.QT, "/autofilter", "mt", "2" ) + , "//*[@numFound='4']" ); + + assertQ("", req(CommonParams.Q, "red shoes", CommonParams.QT, "/autofilter", "mt", "2" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + } + + @Test + public void testBoostFilter( ) { + // use autofilter handler configured with boostFactor + clearIndex(); + assertU(commit()); + assertU(adoc( "id", "1", "color", "red", "product", "shoes" )); + assertU(adoc( "id", "2", "color", "red", "product", "socks" )); + assertU(adoc( "id", "3", "color", "brown", "product", "socks" )); + assertU(adoc( "id", "4", "color", "green", "brand", "red lion", "product", "socks" )); + assertU(adoc( "id", "5", "color", "blue", "brand", "red lion", "product", "socks" )); + assertU(adoc( "id", "6", "color", "blue", "brand", "red dragon", "product", "socks" )); + assertU(adoc( "id", "7", 
"brand", "red baron", "product", "pizza" )); + assertU(adoc( "id", "8", "brand", "red label", "product", "whiskey" )); + assertU(adoc( "id", "9", "brand", "red light", "product", "smoke detector" )); + assertU(adoc( "id", "10", "brand", "red star", "product", "yeast" )); + assertU(adoc( "id", "11", "brand", "gallo", "product", "red wine" )); + assertU(adoc( "id", "12", "brand", "heinz", "product", "red wine vinegar" )); + assertU(adoc( "id", "13", "brand", "dole", "product", "red grapes" )); + assertU(adoc( "id", "14", "brand", "acme", "product", "red brick" )); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "blue red dragon socks", CommonParams.QT, "/autofilterBQ", "rows", "20" ) + , "//*[@numFound='14']" + , "//doc[./str[@name='id']='6']" + , "//doc[./str[@name='id']='5']" + , "//doc[./str[@name='id']='2']" + , "//doc[./str[@name='id']='4']" + , "//doc[./str[@name='id']='3']" + , "//doc[./str[@name='id']='1']" + , "//doc[./str[@name='id']='7']" + , "//doc[./str[@name='id']='8']" + , "//doc[./str[@name='id']='9']" + , "//doc[./str[@name='id']='10']" + , "//doc[./str[@name='id']='11']" + , "//doc[./str[@name='id']='12']" + , "//doc[./str[@name='id']='13']" + , "//doc[./str[@name='id']='14']" ); + } + + @Test + public void testExcludeFields( ) { + // use autofilter handler configured with excludeFields + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "brand", "red label", "product", "whiskey")); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "1", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='1']" ); + + // removes 'id' as an autofilter field + assertQ("", req(CommonParams.Q, "1", CommonParams.QT, "/autofilterEX" ) + , "//*[@numFound='0']" ); + + } + + @Test + public void 
testStopWords( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "color", "red", "brand", "calvin klein", "product", "underwear")); + assertU(adoc("id", "5", "color", "red", "brand", "fruit of the loom", "product", "underwear")); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red calvin klein underwear", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']" ); + + // stop words should be removed: 'by' is not part of a brand name phrase + assertQ("", req(CommonParams.Q, "red underwear by calvin klein", CommonParams.QT, "/autofilterSW" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='4']"); + + // stop words should not be removed from within a matching phrase + assertQ("", req(CommonParams.Q, "red fruit of the loom underwear", CommonParams.QT, "/autofilterSW" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='5']"); + } + + @Test + public void testRandomOrder( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "brand", "red label", "product", "whiskey")); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "red lion socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='3']" ); + + assertQ("", req(CommonParams.Q, "socks red lion", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='3']" ); + } + + @Test + public void testBadQueries( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", 
"color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "brand", "red label", "product", "whiskey")); + assertU(adoc("id", "5", "color", "blue", "brand", "green dragon", "product", "socks")); + assertU(commit()); + + // green red tiger socks -> tiger (color:(green OR red) AND product:socks) + assertQ("", req(CommonParams.Q, "green red tiger socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='2']" + , "//doc[./str[@name='id']='2']" + , "//doc[./str[@name='id']='3']"); + + // green red lion socks blahblah -> blahblah (color:green AND brand:"red lion" AND product:socks) + assertQ("", req(CommonParams.Q, "green red lion socks blahblah", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='3']" ); + } + + @Test + public void testMultipleFieldValues( ) { + clearIndex(); + assertU(commit()); + assertU(adoc("id", "1", "color", "red", "product", "shoes" )); + assertU(adoc("id", "2", "color", "red", "product", "socks" )); + assertU(adoc("id", "3", "color", "green", "brand", "red lion", "product", "socks")); + assertU(adoc("id", "4", "brand", "red label", "product", "whiskey")); + assertU(adoc("id", "5", "color", "blue", "brand", "green dragon", "product", "socks")); + assertU(commit()); + + // should create filter query: color:(red OR green) product:socks + assertQ("", req(CommonParams.Q, "red green socks", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='2']" + , "//doc[./str[@name='id']='2']" + , "//doc[./str[@name='id']='3']"); + } + + @Test + public void testMultipleFieldNames( ) { + clearIndex(); + assertU(commit()); + //assertU(adoc("id", "1", "first_name", "Tucker", "last_name", "Thomas", "full_name", "Tucker Thomas")); + //assertU(adoc("id", "2", "first_name", "Thomas", "last_name", "Tucker", "full_name", "Thomas Tucker")); + assertU(adoc("id", "1", "full_name", "Tucker Thomas", "text", "Tucker Thomas")); + assertU(adoc("id", 
"2", "full_name", "Thomas Tucker", "text", "Thomas Tucker")); + assertU(commit()); + + // should create filter query (first_name:thomas OR last_name:thomas) + assertQ("", req(CommonParams.Q, "Thomas", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='2']" ); + + // uses longer contiguous phrase for full_name - creates fq=full_name:"thomas tucker" + // this breaks now because of "fix" for testAmbiguousFields + assertQ("", req(CommonParams.Q, "Thomas Tucker", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" + , "//doc[./str[@name='id']='2']"); + } + + @Test + public void testMultiValuedField( ) { + clearIndex(); + assertU(commit()); + assertU( multiValueDocs ); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "fast stylish", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" ); + + assertQ("", req(CommonParams.Q, "fast and stylish", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" ); + + assertQ("", req(CommonParams.Q, "fast or stylish", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='3']" ); + } + + @Test + public void testAmbiguousFields( ) { + clearIndex(); + assertU(commit()); + assertU( whiteAmbiguousDocs ); + assertU(commit()); + + // should create (brand_s:"white linen" OR (color:white AND material_s:linen)) + assertQ("", req(CommonParams.Q, "white linen", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='3']" ); + + assertQ("", req(CommonParams.Q, "white linen perfume", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" ); + + assertQ("", req(CommonParams.Q, "white linen shirt", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='2']" ); + + assertQ("", req(CommonParams.Q, "mens white linen shirt", CommonParams.QT, "/autofilter" ) + , "//*[@numFound='1']" ); + + } + + + @Test + public void testVerbMappings( ) { + clearIndex(); + assertU(commit()); + assertU( musicDocs ); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "Bob Dylan Songs", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='3']" ); 
+ + assertQ("", req(CommonParams.Q, "Songs Bob Dylan wrote", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='2']" ); + + assertQ("", req(CommonParams.Q, "Songs Bob Dylan performed", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='2']" ); + + assertQ("", req(CommonParams.Q, "Songs Bob Dylan covered", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='1']" ); + + } + + @Test + public void testNounPhraseMappings( ) { + clearIndex(); + assertU(commit()); + assertU( beatlesDocs ); + assertU(commit()); + + assertQ("", req(CommonParams.Q, "Beatles Songs", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='3']" ); + + assertQ("", req(CommonParams.Q, "Beatles Songs covered", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='2']" ); + + assertQ("", req(CommonParams.Q, "Beatles Songs covered by Joan Baez", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='1']" ); + + assertQ("", req(CommonParams.Q, "Songs Beatles covered", CommonParams.QT, "/autofilterVRB" ) + , "//*[@numFound='1']" ); + } + + + private static String multiValueDocs = "1fast" + + "stylish" + + "2fast" + + "powerful" + + "3stylish"; + + private static String whiteAmbiguousDocs = "1perfume" + + "fragrencesWhite Linen" + + "womens" + + "2dress shirt" + + "shirtWhite" + + "Linenwomens" + + "3dress shirt" + + "shirtWhite" + + "Linenmens"; + + private static String musicDocs = "1All Along the Watchtower" + + "Bob DylanJimi Hendrix" + + "SongCover" + + "2The Mighty Quinn" + + "Bob DylanBob Dylan" + + "SongOriginal" + + "3This Land is Your Land" + + "Woody GuthrieBob Dylan" + + "SongCover"; + + private static String beatlesDocs = "1Let It Be" + + "Beatles" + + "Joan Baez" + + "Cover" + + "Song" + + "2Something" + + "Beatles" + + "Frank Sinatra" + + "Cover" + + "Song" + + "3Honey Don't" + + "Carl Perkins" + + "Beatles" + + "Cover" + + "Song"; + +} \ No newline at end of file diff --git a/solr6.x/src/test/resources/solr/collection1/conf/currency.xml 
b/solr6.x/src/test/resources/solr/collection1/conf/currency.xml new file mode 100644 index 0000000..6a12b32 --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/currency.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/managed-schema b/solr6.x/src/test/resources/solr/collection1/conf/managed-schema new file mode 100644 index 0000000..f16a44b --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/managed-schema @@ -0,0 +1,126 @@ + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/solr6.x/src/test/resources/solr/collection1/conf/schema-autofilter.xml.bak b/solr6.x/src/test/resources/solr/collection1/conf/schema-autofilter.xml.bak new file mode 100644 index 0000000..51269dc --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/schema-autofilter.xml.bak @@ -0,0 +1,273 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/solrconfig-autofilter.xml b/solr6.x/src/test/resources/solr/collection1/conf/solrconfig-autofilter.xml new file mode 100644 index 0000000..785a177 --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/solrconfig-autofilter.xml @@ -0,0 +1,242 @@ + + + + + + ${tests.luceneMatchVersion:LATEST} + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + 1024 + + + + + + + + + + true + + 10 + + 
+ + + + + + + + + + + + explicit + 10 + text + + + + + + + explicit + text + + + autofilter + + + + + synonyms-autofilter.txt + + + + + explicit + text + + + autofilterSW + + + + + stopwords.txt + + + + + + explicit + text + + + autofilterBQ + + + + + 100 + + + + + + explicit + text + 2 + + + autofilter + + + + + + + explicit + text + + + autofilterEX + + + + + + id + + + + + + explicit + text + + + autofilterVRB + + + + + + written,wrote,composed:composer_s + performed,played,sang,recorded:performer_s + covered,covers:performer_s|version_s:Cover|original_performer_s:_ENTITY_,recording_type_s:Song=>original_performer_s:_ENTITY_ + + + + + + + + + termsComp + + + + + + + + max-age=30, public + + + + + solr + solrconfig.xml schema.xml admin-extra.html + + + + prefix-${solr.test.sys.prop2}-suffix + + + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml b/solr6.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml new file mode 100644 index 0000000..13f8214 --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml @@ -0,0 +1,47 @@ + + + + + + + + + ${useCompoundFile:false} + + ${solr.tests.maxBufferedDocs} + ${solr.tests.maxIndexingThreads:8} + ${solr.tests.ramBufferSizeMB} + + + + 1000 + 10000 + + + ${solr.tests.lockType:single} + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/stopwords.txt b/solr6.x/src/test/resources/solr/collection1/conf/stopwords.txt new file mode 100644 index 0000000..b5824da --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/stopwords.txt @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +#Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +s +such +t +that +the +their +then +there +these +they +this +to +was +will +with + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/synonyms-autofilter.txt b/solr6.x/src/test/resources/solr/collection1/conf/synonyms-autofilter.txt new file mode 100644 index 0000000..367e7a6 --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/synonyms-autofilter.txt @@ -0,0 +1,17 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#----------------------------------------------------------------------- +chaise lounge,lounge chair,daybed,day bed +red,rouge,crimson,scarlet +couch,sofa + diff --git a/solr6.x/src/test/resources/solr/collection1/conf/synonyms.txt b/solr6.x/src/test/resources/solr/collection1/conf/synonyms.txt new file mode 100644 index 0000000..b0e31cb --- /dev/null +++ b/solr6.x/src/test/resources/solr/collection1/conf/synonyms.txt @@ -0,0 +1,31 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaa => aaaa +bbb => bbbb1 bbbb2 +ccc => cccc1,cccc2 +a\=>a => b\=>b +a\,a => b\,b +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma +