Skip to content

Commit c26897d

Browse files
authored
[BugFix] Fix NOT IN including null/missing rows due to missing exists filter (#5165) (#5337)
When PredicateAnalyzer handles a SEARCH expression with complemented points (NOT IN) and nullAs=UNKNOWN, the generated DSL query now includes an exists filter to exclude null/missing documents. This aligns with SQL three-valued logic where NULL NOT IN (...) evaluates to UNKNOWN, not TRUE. Signed-off-by: Songkan Tang <songkant@amazon.com>
1 parent d48aee2 commit c26897d

4 files changed

Lines changed: 189 additions & 1 deletion

File tree

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.calcite.remote;
7+
8+
import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_BANK_WITH_NULL_VALUES;
9+
import static org.opensearch.sql.util.MatcherUtils.rows;
10+
import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder;
11+
12+
import java.io.IOException;
13+
import org.json.JSONObject;
14+
import org.junit.Test;
15+
import org.opensearch.sql.ppl.PPLIntegTestCase;
16+
17+
/** Integration test for NOT IN excluding null/missing rows (issue #5165). */
18+
public class CalciteNotInNullFilterIT extends PPLIntegTestCase {
19+
20+
@Override
21+
public void init() throws Exception {
22+
super.init();
23+
enableCalcite();
24+
loadIndex(Index.BANK_WITH_NULL_VALUES);
25+
}
26+
27+
@Test
28+
public void testNotInExcludesNullRows() throws IOException {
29+
// age values: 32, 36, 28, 33, 36, null, 34
30+
// NOT IN (32, 28) should return 36, 33, 36, 34 — excluding the null row
31+
JSONObject result =
32+
executeQuery(
33+
String.format(
34+
"source=%s | where age NOT IN (32, 28) | fields age | sort age",
35+
TEST_INDEX_BANK_WITH_NULL_VALUES));
36+
verifyDataRowsInOrder(result, rows(33), rows(34), rows(36), rows(36));
37+
}
38+
39+
@Test
40+
public void testNotInExcludesNullAndMissingRows() throws IOException {
41+
// balance values: 39225, null, 32838, 4180, null, null, 48086
42+
// NOT IN (39225) should return 32838, 4180, 48086 — excluding null/missing rows
43+
JSONObject result =
44+
executeQuery(
45+
String.format(
46+
"source=%s | where balance NOT IN (39225) | fields balance | sort balance",
47+
TEST_INDEX_BANK_WITH_NULL_VALUES));
48+
verifyDataRowsInOrder(result, rows(4180), rows(32838), rows(48086));
49+
}
50+
51+
@Test
52+
public void testInWithNullRowsIsUnaffected() throws IOException {
53+
// IN should naturally exclude nulls (positive match never matches null)
54+
JSONObject result =
55+
executeQuery(
56+
String.format(
57+
"source=%s | where age IN (32, 28) | fields age | sort age",
58+
TEST_INDEX_BANK_WITH_NULL_VALUES));
59+
verifyDataRowsInOrder(result, rows(28), rows(32));
60+
}
61+
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
setup:
2+
- do:
3+
query.settings:
4+
body:
5+
transient:
6+
plugins.calcite.enabled: true
7+
- do:
8+
indices.create:
9+
index: issue5165
10+
body:
11+
settings:
12+
number_of_shards: 1
13+
number_of_replicas: 0
14+
mappings:
15+
properties:
16+
int_field:
17+
type: integer
18+
- do:
19+
bulk:
20+
refresh: true
21+
body:
22+
- '{"index": {"_index": "issue5165", "_id": "1"}}'
23+
- '{"int_field": 42}'
24+
- '{"index": {"_index": "issue5165", "_id": "2"}}'
25+
- '{"int_field": -1}'
26+
- '{"index": {"_index": "issue5165", "_id": "3"}}'
27+
- '{"int_field": 0}'
28+
- '{"index": {"_index": "issue5165", "_id": "4"}}'
29+
- '{"int_field": 2147483647}'
30+
- '{"index": {"_index": "issue5165", "_id": "5"}}'
31+
- '{"int_field": null}'
32+
33+
---
34+
teardown:
35+
- do:
36+
indices.delete:
37+
index: issue5165
38+
ignore_unavailable: true
39+
- do:
40+
query.settings:
41+
body:
42+
transient:
43+
plugins.calcite.enabled: false
44+
45+
---
46+
"Issue 5165: NOT IN should exclude null/missing rows":
47+
- skip:
48+
features:
49+
- headers
50+
- allowed_warnings
51+
- do:
52+
allowed_warnings:
53+
- 'Loading the fielddata on the _id field is deprecated and will be removed in future versions. If you require sorting or aggregating on this field you should also include the id in the body of your documents, and map this field as a keyword field that has [doc_values] enabled'
54+
headers:
55+
Content-Type: 'application/json'
56+
ppl:
57+
body:
58+
query: source=issue5165 | where int_field NOT IN (42, -1, 0) | fields int_field
59+
60+
- match: { total: 1 }
61+
- length: { datarows: 1 }
62+
- match: { datarows: [ [ 2147483647 ] ] }

opensearch/src/main/java/org/opensearch/sql/opensearch/request/PredicateAnalyzer.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,14 @@ private QueryExpression binary(RexCall call) {
725725
CompoundQueryExpression.or(
726726
expression, QueryExpression.create(pair.getKey()).notExists());
727727
// e.g. where a = 1 or a = 2
728-
case UNKNOWN -> expression;
728+
// For NOT IN (complemented points), SQL three-valued logic dictates
729+
// NULL NOT IN (...) evaluates to UNKNOWN (not TRUE), so null rows
730+
// must be excluded via an exists filter.
731+
case UNKNOWN ->
732+
isSearchWithComplementedPoints(call)
733+
? CompoundQueryExpression.and(
734+
false, expression, QueryExpression.create(pair.getKey()).exists())
735+
: expression;
729736
};
730737
finalExpression.updateAnalyzedNodes(call);
731738
return finalExpression;

opensearch/src/test/java/org/opensearch/sql/opensearch/request/PredicateAnalyzerTest.java

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,4 +1245,62 @@ void search_complementedPointsWithNullAsFalse_generatesExistsAndNotInQuery()
12451245
""",
12461246
result.toString());
12471247
}
1248+
1249+
@Test
1250+
void search_complementedPointsWithNullAsUnknown_generatesExistsAndNotInQuery()
1251+
throws ExpressionNotAnalyzableException {
1252+
// Simulates: a NOT IN (12, 13)
1253+
// Calcite represents this as SEARCH($0, Sarg[...; NULL AS UNKNOWN]) with complemented points
1254+
// SQL three-valued logic: NULL NOT IN (...) evaluates to UNKNOWN (not TRUE),
1255+
// so null rows must be excluded.
1256+
Sarg<BigDecimal> sarg =
1257+
Sarg.of(
1258+
RexUnknownAs.UNKNOWN,
1259+
ImmutableRangeSet.<BigDecimal>builder()
1260+
.add(Range.lessThan(BigDecimal.valueOf(12)))
1261+
.add(Range.open(BigDecimal.valueOf(12), BigDecimal.valueOf(13)))
1262+
.add(Range.greaterThan(BigDecimal.valueOf(13)))
1263+
.build());
1264+
RexNode sargLiteral =
1265+
builder.makeSearchArgumentLiteral(sarg, typeFactory.createSqlType(SqlTypeName.DECIMAL));
1266+
RexNode call = builder.makeCall(SqlStdOperatorTable.SEARCH, field1, sargLiteral);
1267+
QueryBuilder result = PredicateAnalyzer.analyze(call, schema, fieldTypes);
1268+
1269+
assertInstanceOf(BoolQueryBuilder.class, result);
1270+
assertEquals(
1271+
"""
1272+
{
1273+
"bool" : {
1274+
"must" : [
1275+
{
1276+
"bool" : {
1277+
"must_not" : [
1278+
{
1279+
"terms" : {
1280+
"a" : [
1281+
12.0,
1282+
13.0
1283+
],
1284+
"boost" : 1.0
1285+
}
1286+
}
1287+
],
1288+
"adjust_pure_negative" : true,
1289+
"boost" : 1.0
1290+
}
1291+
},
1292+
{
1293+
"exists" : {
1294+
"field" : "a",
1295+
"boost" : 1.0
1296+
}
1297+
}
1298+
],
1299+
"adjust_pure_negative" : true,
1300+
"boost" : 1.0
1301+
}
1302+
}\
1303+
""",
1304+
result.toString());
1305+
}
12481306
}

0 commit comments

Comments
 (0)