Skip to content

Commit 5a75558

Browse files
branch-4.0: [fix](inverted index) Fix empty string MATCH on keyword index returning wrong results #60500 (#60516)
Cherry-picked from #60500
Co-authored-by: Jack <jiangkai@selectdb.com>
1 parent 98fbe2b commit 5a75558

File tree

5 files changed

+115
-15
lines changed

5 files changed

+115
-15
lines changed

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,10 +187,10 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
187187
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
188188
const std::string& search_str, const std::map<std::string, std::string>& properties) {
189189
if (!should_analyzer(properties)) {
190+
// Keyword index: all strings (including empty) are valid tokens for exact match.
191+
// Empty string is a valid value in keyword index and should be matchable.
190192
std::vector<TermInfo> result;
191-
if (!search_str.empty()) {
192-
result.emplace_back(search_str);
193-
}
193+
result.emplace_back(search_str);
194194
return result;
195195
}
196196
InvertedIndexAnalyzerConfig config;

be/src/olap/rowset/segment_v2/inverted_index_reader.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -315,10 +315,9 @@ Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
315315
} else {
316316
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
317317
if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) {
318-
// Don't add empty string as token - empty query should match nothing
319-
if (!search_str.empty()) {
320-
query_info.term_infos.emplace_back(search_str);
321-
}
318+
// Keyword index: all strings (including empty) are valid tokens for exact match.
319+
// Empty string is a valid value in keyword index and should be matchable.
320+
query_info.term_infos.emplace_back(search_str);
322321
} else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
323322
// Use analyzer from query context for consistent behavior across all segments.
324323
// This ensures that the query uses the same analyzer settings (e.g., lowercase)

be/src/vec/functions/match.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -205,20 +205,17 @@ std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
205205
// - PARSER_NONE: no tokenization (keyword/exact match)
206206
// - Other parsers: tokenize using the analyzer
207207
if (!analyzer_ctx->should_tokenize()) {
208-
// Keyword index or no tokenization needed
209-
// Don't add empty string as token - empty query should match nothing
210-
if (!match_query_str.empty()) {
211-
query_tokens.emplace_back(match_query_str);
212-
}
208+
// Keyword index: all strings (including empty) are valid tokens for exact match.
209+
// Empty string is a valid value in keyword index and should be matchable.
210+
query_tokens.emplace_back(match_query_str);
213211
return query_tokens;
214212
}
215213

216214
// Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
217215
if (analyzer_ctx->analyzer == nullptr) {
218216
VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
219-
if (!match_query_str.empty()) {
220-
query_tokens.emplace_back(match_query_str);
221-
}
217+
// Fallback case: the query string is used as a single untokenized term, so an empty string is matchable here too
218+
query_tokens.emplace_back(match_query_str);
222219
return query_tokens;
223220
}
224221

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !keyword_index_path --
3+
1
4+
3
5+
6+
-- !keyword_slow_path --
7+
1
8+
3
9+
10+
-- !english_index_path --
11+
0
12+
13+
-- !english_slow_path --
14+
0
15+
16+
-- !keyword_nonempty --
17+
2
18+
19+
-- !match_any_empty --
20+
1
21+
3
22+
23+
-- !match_all_empty --
24+
1
25+
3
26+
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
suite("test_empty_string_match", "p0") {
19+
def tableName = "test_empty_string_match"
20+
21+
sql "DROP TABLE IF EXISTS ${tableName}"
22+
sql """
23+
CREATE TABLE ${tableName} (
24+
id INT,
25+
keyword_col TEXT DEFAULT '',
26+
english_col TEXT DEFAULT '',
27+
INDEX keyword_idx(keyword_col) USING INVERTED COMMENT 'keyword index',
28+
INDEX english_idx(english_col) USING INVERTED PROPERTIES("parser" = "english") COMMENT 'english parser'
29+
) ENGINE=OLAP
30+
DUPLICATE KEY(id)
31+
DISTRIBUTED BY HASH(id) BUCKETS 1
32+
PROPERTIES("replication_allocation" = "tag.location.default: 1");
33+
"""
34+
35+
sql """
36+
INSERT INTO ${tableName} VALUES
37+
(1, '', 'hello world'),
38+
(2, 'test', ''),
39+
(3, '', ''),
40+
(4, 'data', 'some text');
41+
"""
42+
43+
sql "SET enable_common_expr_pushdown = true"
44+
45+
// Test 1: Empty string match on keyword index (index path)
46+
// Should match rows where keyword_col is empty string (rows 1 and 3)
47+
sql "SET enable_inverted_index_query = true"
48+
qt_keyword_index_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id"""
49+
50+
// Test 2: Empty string match on keyword index (slow path)
51+
// Should also match rows where keyword_col is empty string
52+
sql "SET enable_inverted_index_query = false"
53+
sql "SET enable_match_without_inverted_index = true"
54+
qt_keyword_slow_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id"""
55+
56+
// Test 3: Empty string match on tokenized index (index path)
57+
// Should return a count of 0: the empty string tokenizes to nothing, so no rows match
58+
sql "SET enable_inverted_index_query = true"
59+
qt_english_index_path """SELECT count() FROM ${tableName} WHERE english_col match ''"""
60+
61+
// Test 4: Empty string match on tokenized index (slow path)
62+
// Should also return a count of 0 (no rows match)
63+
sql "SET enable_inverted_index_query = false"
64+
qt_english_slow_path """SELECT count() FROM ${tableName} WHERE english_col match ''"""
65+
66+
// Test 5: Non-empty string match on keyword index should work as before
67+
sql "SET enable_inverted_index_query = true"
68+
qt_keyword_nonempty """SELECT id FROM ${tableName} WHERE keyword_col match 'test' ORDER BY id"""
69+
70+
// Test 6: Verify match_any with empty string on keyword index (slow path)
71+
sql "SET enable_inverted_index_query = false"
72+
qt_match_any_empty """SELECT id FROM ${tableName} WHERE keyword_col match_any '' ORDER BY id"""
73+
74+
// Test 7: Verify match_all with empty string on keyword index (slow path; enable_inverted_index_query is still false from Test 6)
75+
qt_match_all_empty """SELECT id FROM ${tableName} WHERE keyword_col match_all '' ORDER BY id"""
76+
77+
sql "DROP TABLE IF EXISTS ${tableName}"
78+
}

0 commit comments

Comments
 (0)