Skip to content

Commit a6eb368

Browse files
peteralfonsiPeter Alfonsi
andauthored
BooleanQuery rewrite for must_not RangeQuery clauses (#17655)
--------- Signed-off-by: Peter Alfonsi <[email protected]> Signed-off-by: Peter Alfonsi <[email protected]> Co-authored-by: Peter Alfonsi <[email protected]>
1 parent d2b3ba0 commit a6eb368

File tree

5 files changed

+542
-1
lines changed

5 files changed

+542
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3131
- Use QueryCoordinatorContext for the rewrite in validate API. ([#18272](https://github.com/opensearch-project/OpenSearch/pull/18272))
3232
- Upgrade crypto kms plugin dependencies for AWS SDK v2.x. ([#18268](https://github.com/opensearch-project/OpenSearch/pull/18268))
3333
- Add support for `matched_fields` with the unified highlighter ([#18164](https://github.com/opensearch-project/OpenSearch/issues/18164))
34+
- Add BooleanQuery rewrite for must_not RangeQuery clauses ([#17655](https://github.com/opensearch-project/OpenSearch/pull/17655))
3435
- [repository-s3] Add support for SSE-KMS and S3 bucket owner verification ([#18312](https://github.com/opensearch-project/OpenSearch/pull/18312))
3536
- Optimize gRPC perf by passing by reference ([#18303](https://github.com/opensearch-project/OpenSearch/pull/18303))
3637
- Added File Cache Stats - Involves Block level as well as full file level stats ([#17538](https://github.com/opensearch-project/OpenSearch/issues/17479))
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.search.query;
10+
11+
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
12+
13+
import org.opensearch.common.settings.Settings;
14+
import org.opensearch.test.ParameterizedStaticSettingsOpenSearchIntegTestCase;
15+
16+
import java.util.Arrays;
17+
import java.util.Collection;
18+
19+
import static org.opensearch.index.query.QueryBuilders.boolQuery;
20+
import static org.opensearch.index.query.QueryBuilders.matchAllQuery;
21+
import static org.opensearch.index.query.QueryBuilders.rangeQuery;
22+
import static org.opensearch.index.query.QueryBuilders.termQuery;
23+
import static org.opensearch.search.SearchService.CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING;
24+
import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertHitCount;
25+
26+
public class BooleanQueryIT extends ParameterizedStaticSettingsOpenSearchIntegTestCase {
27+
28+
public BooleanQueryIT(Settings staticSettings) {
29+
super(staticSettings);
30+
}
31+
32+
@ParametersFactory
33+
public static Collection<Object[]> parameters() {
34+
return Arrays.asList(
35+
new Object[] { Settings.builder().put(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING.getKey(), false).build() },
36+
new Object[] { Settings.builder().put(CLUSTER_CONCURRENT_SEGMENT_SEARCH_SETTING.getKey(), true).build() }
37+
);
38+
}
39+
40+
public void testMustNotRangeRewrite() throws Exception {
41+
String intField = "int_field";
42+
String termField1 = "term_field_1";
43+
String termField2 = "term_field_2";
44+
// If we don't explicitly set this mapping, term_field_2 is not correctly mapped as a keyword field and queries don't work as
45+
// expected
46+
createIndex(
47+
"test",
48+
Settings.EMPTY,
49+
"{\"properties\":{\"int_field\":{\"type\": \"integer\"},\"term_field_1\":{\"type\": \"keyword\"},\"term_field_2\":{\"type\": \"keyword\"}}}}"
50+
);
51+
int numDocs = 100;
52+
53+
for (int i = 0; i < numDocs; i++) {
54+
String termValue1 = "odd";
55+
String termValue2 = "A";
56+
if (i % 2 == 0) {
57+
termValue1 = "even";
58+
}
59+
if (i >= numDocs / 2) {
60+
termValue2 = "B";
61+
}
62+
client().prepareIndex("test")
63+
.setId(Integer.toString(i))
64+
.setSource(intField, i, termField1, termValue1, termField2, termValue2)
65+
.get();
66+
}
67+
ensureGreen();
68+
waitForRelocation();
69+
forceMerge();
70+
refresh();
71+
72+
int lt = 80;
73+
int gte = 20;
74+
int expectedHitCount = numDocs - (lt - gte);
75+
76+
assertHitCount(
77+
client().prepareSearch().setQuery(boolQuery().mustNot(rangeQuery(intField).lt(lt).gte(gte))).get(),
78+
expectedHitCount
79+
);
80+
81+
assertHitCount(
82+
client().prepareSearch()
83+
.setQuery(boolQuery().must(termQuery(termField1, "even")).mustNot(rangeQuery(intField).lt(lt).gte(gte)))
84+
.get(),
85+
expectedHitCount / 2
86+
);
87+
88+
// Test with preexisting should fields, to ensure the original default minimumShouldMatch is respected
89+
// once we add a must clause during rewrite, even if minimumShouldMatch is not explicitly set
90+
assertHitCount(
91+
client().prepareSearch().setQuery(boolQuery().should(termQuery(termField1, "even")).should(termQuery(termField2, "B"))).get(),
92+
3 * numDocs / 4
93+
);
94+
assertHitCount(
95+
client().prepareSearch()
96+
.setQuery(
97+
boolQuery().should(termQuery(termField1, "even"))
98+
.should(termQuery(termField2, "A"))
99+
.mustNot(rangeQuery(intField).lt(lt).gte(gte))
100+
)
101+
.get(),
102+
3 * expectedHitCount / 4
103+
);
104+
105+
assertHitCount(client().prepareSearch().setQuery(boolQuery().mustNot(rangeQuery(intField).lt(numDocs * 2))).get(), 0);
106+
}
107+
108+
public void testMustNotDateRangeRewrite() throws Exception {
109+
int minDay = 10;
110+
int maxDay = 31;
111+
String dateField = "date_field";
112+
createIndex("test");
113+
for (int day = minDay; day <= maxDay; day++) {
114+
client().prepareIndex("test").setSource(dateField, getDate(day, 0)).get();
115+
}
116+
ensureGreen();
117+
waitForRelocation();
118+
forceMerge();
119+
refresh();
120+
121+
int minExcludedDay = 15;
122+
int maxExcludedDay = 25;
123+
int expectedHitCount = maxDay + 1 - minDay - (maxExcludedDay - minExcludedDay + 1);
124+
125+
assertHitCount(
126+
client().prepareSearch("test")
127+
.setQuery(boolQuery().mustNot(rangeQuery(dateField).gte(getDate(minExcludedDay, 0)).lte(getDate(maxExcludedDay, 0))))
128+
.get(),
129+
expectedHitCount
130+
);
131+
}
132+
133+
public void testMustNotDateRangeWithFormatAndTimeZoneRewrite() throws Exception {
134+
// Index a document for each hour of 1/1 in UTC. Then, do a boolean query excluding 1/1 for UTC+3.
135+
// If the time zone is respected by the rewrite, we should see 3 matching documents,
136+
// as there will be 3 hours that are 1/1 in UTC but not UTC+3.
137+
138+
String dateField = "date_field";
139+
createIndex("test");
140+
String format = "strict_date_hour_minute_second_fraction";
141+
142+
for (int hour = 0; hour < 24; hour++) {
143+
client().prepareIndex("test").setSource(dateField, getDate(1, hour)).get();
144+
}
145+
ensureGreen();
146+
waitForRelocation();
147+
forceMerge();
148+
refresh();
149+
150+
int zoneOffset = 3;
151+
assertHitCount(
152+
client().prepareSearch("test")
153+
.setQuery(
154+
boolQuery().mustNot(
155+
rangeQuery(dateField).gte(getDate(1, 0)).lte(getDate(1, 23)).timeZone("+" + zoneOffset).format(format)
156+
)
157+
)
158+
.get(),
159+
zoneOffset
160+
);
161+
}
162+
163+
public void testMustNotRangeRewriteWithMissingValues() throws Exception {
164+
// This test passes because the rewrite is skipped when not all docs have exactly 1 value
165+
String intField = "int_field";
166+
String termField = "term_field";
167+
createIndex("test");
168+
int numDocs = 100;
169+
int numDocsMissingIntValues = 20;
170+
171+
for (int i = 0; i < numDocs; i++) {
172+
String termValue = "odd";
173+
if (i % 2 == 0) {
174+
termValue = "even";
175+
}
176+
177+
if (i >= numDocsMissingIntValues) {
178+
client().prepareIndex("test").setId(Integer.toString(i)).setSource(intField, i, termField, termValue).get();
179+
} else {
180+
client().prepareIndex("test").setId(Integer.toString(i)).setSource(termField, termValue).get();
181+
}
182+
}
183+
ensureGreen();
184+
waitForRelocation();
185+
forceMerge();
186+
refresh();
187+
188+
// Search excluding range 30 to 50
189+
190+
int lt = 50;
191+
int gte = 30;
192+
int expectedHitCount = numDocs - (lt - gte); // Expected hit count includes documents missing this value
193+
194+
assertHitCount(
195+
client().prepareSearch().setQuery(boolQuery().mustNot(rangeQuery(intField).lt(lt).gte(gte))).get(),
196+
expectedHitCount
197+
);
198+
}
199+
200+
public void testMustNotRangeRewriteWithMoreThanOneValue() throws Exception {
201+
// This test passes because the rewrite is skipped when not all docs have exactly 1 value
202+
String intField = "int_field";
203+
createIndex("test");
204+
int numDocs = 100;
205+
int numDocsWithExtraValue = 20;
206+
int extraValue = -1;
207+
208+
for (int i = 0; i < numDocs; i++) {
209+
if (i < numDocsWithExtraValue) {
210+
client().prepareIndex("test").setId(Integer.toString(i)).setSource(intField, new int[] { extraValue, i }).get();
211+
} else {
212+
client().prepareIndex("test").setId(Integer.toString(i)).setSource(intField, i).get();
213+
}
214+
}
215+
ensureGreen();
216+
waitForRelocation();
217+
forceMerge();
218+
refresh();
219+
220+
// Range queries will match if ANY of the doc's values are within the range.
221+
// So if we exclude the range 0 to 20, we shouldn't see any of those documents returned,
222+
// even though they also have a value -1 which is not excluded.
223+
224+
int expectedHitCount = numDocs - numDocsWithExtraValue;
225+
assertHitCount(
226+
client().prepareSearch().setQuery(boolQuery().mustNot(rangeQuery(intField).lt(numDocsWithExtraValue).gte(0))).get(),
227+
expectedHitCount
228+
);
229+
assertHitCount(client().prepareSearch().setQuery(matchAllQuery()).get(), numDocs);
230+
}
231+
232+
private String padZeros(int value, int length) {
233+
// String.format() not allowed
234+
String ret = Integer.toString(value);
235+
if (ret.length() < length) {
236+
ret = "0".repeat(length - ret.length()) + ret;
237+
}
238+
return ret;
239+
}
240+
241+
private String getDate(int day, int hour) {
242+
return "2016-01-" + padZeros(day, 2) + "T" + padZeros(hour, 2) + ":00:00.000";
243+
}
244+
}

server/src/main/java/org/opensearch/index/query/BoolQueryBuilder.java

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,13 @@
3232

3333
package org.opensearch.index.query;
3434

35+
import org.apache.lucene.index.LeafReader;
36+
import org.apache.lucene.index.LeafReaderContext;
37+
import org.apache.lucene.index.PointValues;
3538
import org.apache.lucene.search.BooleanClause;
3639
import org.apache.lucene.search.BooleanClause.Occur;
3740
import org.apache.lucene.search.BooleanQuery;
41+
import org.apache.lucene.search.IndexSearcher;
3842
import org.apache.lucene.search.MatchAllDocsQuery;
3943
import org.apache.lucene.search.Query;
4044
import org.opensearch.common.lucene.search.Queries;
@@ -48,10 +52,13 @@
4852

4953
import java.io.IOException;
5054
import java.util.ArrayList;
55+
import java.util.HashMap;
56+
import java.util.HashSet;
5157
import java.util.List;
5258
import java.util.Map;
5359
import java.util.Objects;
5460
import java.util.Optional;
61+
import java.util.Set;
5562
import java.util.function.Consumer;
5663
import java.util.stream.Stream;
5764

@@ -393,9 +400,13 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) throws
393400
return any.get();
394401
}
395402

403+
changed |= rewriteMustNotRangeClausesToShould(newBuilder, queryRewriteContext);
404+
396405
if (changed) {
397406
newBuilder.adjustPureNegative = adjustPureNegative;
398-
newBuilder.minimumShouldMatch = minimumShouldMatch;
407+
if (minimumShouldMatch != null) {
408+
newBuilder.minimumShouldMatch = minimumShouldMatch;
409+
}
399410
newBuilder.boost(boost());
400411
newBuilder.queryName(queryName());
401412
return newBuilder;
@@ -460,4 +471,83 @@ public void visit(QueryBuilderVisitor visitor) {
460471

461472
}
462473

474+
private boolean rewriteMustNotRangeClausesToShould(BoolQueryBuilder newBuilder, QueryRewriteContext queryRewriteContext) {
475+
// If there is a range query on a given field in a must_not clause, it's more performant to execute it as
476+
// multiple should clauses representing everything outside the target range.
477+
478+
// First check if we can get the individual LeafContexts. If we can't, we can't proceed with the rewrite, since we can't confirm
479+
// every doc has exactly 1 value for this field.
480+
List<LeafReaderContext> leafReaderContexts = getLeafReaderContexts(queryRewriteContext);
481+
if (leafReaderContexts == null || leafReaderContexts.isEmpty()) {
482+
return false;
483+
}
484+
485+
boolean changed = false;
486+
// For now, only handle the case where there's exactly 1 range query for this field.
487+
Map<String, Integer> fieldCounts = new HashMap<>();
488+
Set<RangeQueryBuilder> rangeQueries = new HashSet<>();
489+
for (QueryBuilder clause : mustNotClauses) {
490+
if (clause instanceof RangeQueryBuilder rq) {
491+
fieldCounts.merge(rq.fieldName(), 1, Integer::sum);
492+
rangeQueries.add(rq);
493+
}
494+
}
495+
496+
for (RangeQueryBuilder rq : rangeQueries) {
497+
String fieldName = rq.fieldName();
498+
if (fieldCounts.getOrDefault(fieldName, 0) == 1) {
499+
// Check that all docs on this field have exactly 1 value, otherwise we can't perform this rewrite
500+
if (checkAllDocsHaveOneValue(leafReaderContexts, fieldName)) {
501+
List<RangeQueryBuilder> complement = rq.getComplement();
502+
if (complement != null) {
503+
BoolQueryBuilder nestedBoolQuery = new BoolQueryBuilder();
504+
nestedBoolQuery.minimumShouldMatch(1);
505+
for (RangeQueryBuilder complementComponent : complement) {
506+
nestedBoolQuery.should(complementComponent);
507+
}
508+
newBuilder.must(nestedBoolQuery);
509+
newBuilder.mustNotClauses.remove(rq);
510+
changed = true;
511+
}
512+
}
513+
}
514+
}
515+
516+
if (minimumShouldMatch == null && changed) {
517+
if ((!shouldClauses.isEmpty()) && mustClauses.isEmpty() && filterClauses.isEmpty()) {
518+
// If there were originally should clauses and no must/filter clauses, null minimumShouldMatch is set to a default of 1
519+
// within Lucene.
520+
// But if there was originally a must or filter clause, the default is 0.
521+
// If we added a must clause due to this rewrite, we should respect what the original default would have been.
522+
newBuilder.minimumShouldMatch(1);
523+
}
524+
}
525+
return changed;
526+
}
527+
528+
private List<LeafReaderContext> getLeafReaderContexts(QueryRewriteContext queryRewriteContext) {
529+
if (queryRewriteContext == null) return null;
530+
QueryShardContext shardContext = queryRewriteContext.convertToShardContext();
531+
if (shardContext == null) return null;
532+
IndexSearcher indexSearcher = shardContext.searcher();
533+
if (indexSearcher == null) return null;
534+
return indexSearcher.getIndexReader().leaves();
535+
}
536+
537+
private boolean checkAllDocsHaveOneValue(List<LeafReaderContext> contexts, String fieldName) {
538+
for (LeafReaderContext lrc : contexts) {
539+
PointValues values;
540+
try {
541+
LeafReader reader = lrc.reader();
542+
values = reader.getPointValues(fieldName);
543+
if (values == null || !(values.getDocCount() == reader.maxDoc() && values.getDocCount() == values.size())) {
544+
return false;
545+
}
546+
} catch (IOException e) {
547+
// If we can't get PointValues to check on the number of values per doc, assume the query is ineligible
548+
return false;
549+
}
550+
}
551+
return true;
552+
}
463553
}

0 commit comments

Comments
 (0)