From 742a6dfdca29bb5a0e764db3c359894dfc41d08c Mon Sep 17 00:00:00 2001 From: Gary Date: Wed, 11 Feb 2026 18:25:23 +0800 Subject: [PATCH 1/8] initial --- be/src/olap/tablet_meta.cpp | 18 ++ be/src/olap/tablet_schema.cpp | 9 + be/src/olap/tablet_schema.h | 3 + be/src/vec/common/variant_util.cpp | 4 + be/src/vec/common/variant_util.h | 20 ++ be/src/vec/json/json_parser.cpp | 27 ++- be/src/vec/json/json_parser.h | 7 + .../doris/catalog/VariantSkipPattern.java | 82 ++++++++ .../org/apache/doris/catalog/VariantType.java | 21 +- .../org/apache/doris/nereids/DorisLexer.g4 | 1 + .../org/apache/doris/nereids/DorisParser.g4 | 2 + .../java/org/apache/doris/catalog/Column.java | 18 ++ .../nereids/parser/LogicalPlanBuilder.java | 22 +- .../apache/doris/nereids/types/DataType.java | 6 +- .../nereids/types/VariantSkipPattern.java | 137 +++++++++++++ .../doris/nereids/types/VariantType.java | 30 ++- .../ExternalFileTableValuedFunction.java | 4 +- gensrc/proto/olap_file.proto | 7 + gensrc/thrift/Descriptors.thrift | 6 + .../predefine/test_schema_template_skip.out | 95 +++++++++ .../test_schema_template_skip.groovy | 188 ++++++++++++++++++ 21 files changed, 691 insertions(+), 16 deletions(-) create mode 100644 fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java create mode 100644 regression-test/data/variant_p0/predefine/test_schema_template_skip.out create mode 100644 regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 8a4c9da9fd2009..29df5c169d1f52 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -562,6 +562,24 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco if (tcolumn.__isset.variant_doc_hash_shard_count) { column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count); } + if 
(tcolumn.__isset.skip_patterns) { + for (const auto& tsp : tcolumn.skip_patterns) { + auto* sp = column->add_skip_patterns(); + if (tsp.__isset.pattern) { + sp->set_pattern(tsp.pattern); + } + if (tsp.__isset.pattern_type) { + switch (tsp.pattern_type) { + case TPatternType::MATCH_NAME: + sp->set_pattern_type(PatternTypePB::MATCH_NAME); + break; + case TPatternType::MATCH_NAME_GLOB: + sp->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); + break; + } + } + } + } } void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) { diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index d79945f1f89cf5..ad73d983cb4d4f 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -680,6 +680,10 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { if (column.has_pattern_type()) { _pattern_type = column.pattern_type(); } + for (const auto& sp : column.skip_patterns()) { + PatternTypePB pt = sp.has_pattern_type() ? 
sp.pattern_type() : PatternTypePB::MATCH_NAME_GLOB; + _variant.skip_patterns.emplace_back(sp.pattern(), pt); + } } TabletColumn TabletColumn::create_materialized_variant_column(const std::string& root, @@ -763,6 +767,11 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_variant_enable_doc_mode(_variant.enable_doc_mode); column->set_variant_doc_materialization_min_rows(_variant.doc_materialization_min_rows); column->set_variant_doc_hash_shard_count(_variant.doc_hash_shard_count); + for (const auto& [pattern, pt] : _variant.skip_patterns) { + auto* sp = column->add_skip_patterns(); + sp->set_pattern(pattern); + sp->set_pattern_type(pt); + } } void TabletColumn::add_sub_column(TabletColumn& sub_column) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 8ed0ee239d1b40..dd75b488135613 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -80,6 +80,9 @@ class TabletColumn : public MetadataAdder { bool enable_doc_mode = false; int64_t doc_materialization_min_rows = 0; int32_t doc_hash_shard_count = 64; + + // skip patterns for variant column + std::vector> skip_patterns; }; TabletColumn(); diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 069a64798d062a..6e901b27310b60 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -2046,6 +2046,10 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::InternalError("column is not variant type, column name: {}", column.name()); } + // set skip patterns if any + if (!column.variant_params().skip_patterns.empty()) { + configs[i].skip_patterns = &column.variant_params().skip_patterns; + } // if doc mode is not enabled, no need to parse to doc value column if (!column.variant_enable_doc_mode()) { configs[i].parse_to = ParseConfig::ParseTo::OnlySubcolumns; diff --git a/be/src/vec/common/variant_util.h b/be/src/vec/common/variant_util.h index 
a36179ac0fbf50..e3532265056704 100644 --- a/be/src/vec/common/variant_util.h +++ b/be/src/vec/common/variant_util.h @@ -71,6 +71,26 @@ Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern // Match a glob pattern against a path using RE2. bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); +// Check if a dot-separated path should be skipped based on skip patterns. +// For MATCH_NAME_GLOB, uses glob matching; for MATCH_NAME, uses exact string comparison. +inline bool should_skip_path( + const std::vector>& skip_patterns, + const std::string& path) { + for (const auto& [pattern, pt] : skip_patterns) { + if (pt == PatternTypePB::MATCH_NAME) { + if (path == pattern) { + return true; + } + } else { + // MATCH_NAME_GLOB + if (glob_match_re2(pattern, path)) { + return true; + } + } + } + return false; +} + using PathToNoneNullValues = std::unordered_map; using PathToDataTypes = std::unordered_map, PathInData::Hash>; diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index eb4d6c5e2b5fee..59fb293b34a934 100644 --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -30,6 +30,7 @@ #include "common/cast_set.h" #include "common/config.h" #include "common/status.h" +#include "vec/common/variant_util.h" #include "vec/json/path_in_data.h" #include "vec/json/simd_json_parser.h" @@ -46,6 +47,7 @@ std::optional JSONDataParser::parse(const char* begin, ParseContext context; context.enable_flatten_nested = config.enable_flatten_nested; context.is_top_array = document.isArray(); + context.skip_patterns = config.skip_patterns; traverse(document, context); ParseResult result; result.values = std::move(context.values); @@ -100,9 +102,26 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC fmt::format("Key length exceeds maximum allowed size of {} bytes.", max_key_length)); } - ctx.builder.append(key, false); - traverse(value, ctx); - 
ctx.builder.pop_back(); + // Check skip patterns: build the dot-separated path and test against patterns + if (ctx.skip_patterns != nullptr && !ctx.skip_patterns->empty()) { + std::string saved_path = ctx.current_path; + if (!ctx.current_path.empty()) { + ctx.current_path.push_back('.'); + } + ctx.current_path.append(key.data(), key.size()); + if (variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path)) { + ctx.current_path = std::move(saved_path); + continue; // skip this key and its entire subtree + } + ctx.builder.append(key, false); + traverse(value, ctx); + ctx.builder.pop_back(); + ctx.current_path = std::move(saved_path); + } else { + ctx.builder.append(key, false); + traverse(value, ctx); + ctx.builder.pop_back(); + } } } @@ -207,7 +226,7 @@ void JSONDataParser::traverseArrayElement(const Element& element, element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; traverse(element, element_ctx); - auto& [_, paths, values, flatten_nested, __, is_top_array] = element_ctx; + auto& [_, paths, values, flatten_nested, __, is_top_array, ___, ____] = element_ctx; if (element_ctx.has_nested_in_flatten && is_top_array) { checkAmbiguousStructure(ctx, paths); diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 69d900ee96db56..251ff4ca28446e 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -28,6 +28,7 @@ #include #include +#include "gen_cpp/olap_file.pb.h" #include "runtime/primitive_type.h" #include "util/jsonb_writer.h" #include "vec/columns/column.h" @@ -107,6 +108,8 @@ struct ParseConfig { BothSubcolumnsAndDocValueColumn = 2, }; ParseTo parse_to = ParseTo::OnlySubcolumns; + // skip patterns for variant column (pointer to avoid copy; nullptr means no skip) + const std::vector>* skip_patterns = nullptr; }; /// Result of parsing of a document. 
/// Contains all paths extracted from document @@ -131,6 +134,10 @@ class JSONDataParser { bool enable_flatten_nested = false; bool has_nested_in_flatten = false; bool is_top_array = false; + // skip patterns pointer (nullptr means no skip) + const std::vector>* skip_patterns = nullptr; + // incrementally maintained dot-separated path for skip matching + std::string current_path; }; using PathPartsWithArray = std::pair; using PathToArray = phmap::flat_hash_map; diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java new file mode 100644 index 00000000000000..bf879067460965 --- /dev/null +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.catalog; + +import org.apache.doris.thrift.TPatternType; + +import com.google.gson.annotations.SerializedName; + +import java.util.Objects; + +/** + * Catalog-layer representation of a variant SKIP pattern. + * Used for Gson persistence in FE metadata. 
+ */ +public class VariantSkipPattern { + + @SerializedName(value = "p") + private final String pattern; + + @SerializedName(value = "pt") + private final TPatternType patternType; + + public VariantSkipPattern(String pattern, TPatternType patternType) { + this.pattern = Objects.requireNonNull(pattern, "pattern should not be null"); + this.patternType = Objects.requireNonNull(patternType, "patternType should not be null"); + } + + public String getPattern() { + return pattern; + } + + public TPatternType getPatternType() { + return patternType; + } + + public String toSql() { + StringBuilder sb = new StringBuilder(); + sb.append("SKIP "); + if (patternType == TPatternType.MATCH_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + VariantSkipPattern that = (VariantSkipPattern) o; + return Objects.equals(pattern, that.pattern) && patternType == that.patternType; + } + + @Override + public int hashCode() { + return Objects.hash(pattern, patternType); + } + + @Override + public String toString() { + return toSql(); + } +} diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index da062d21489f66..33c940075d0ffd 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -40,6 +40,9 @@ public class VariantType extends ScalarType { @SerializedName(value = "fields") private final ArrayList predefinedFields; + @SerializedName(value = "skipPatterns") + private final ArrayList skipPatterns; + @SerializedName(value = "variantMaxSubcolumnsCount") private final int variantMaxSubcolumnsCount; @@ -66,6 +69,7 @@ public class VariantType extends ScalarType { public 
VariantType() { super(PrimitiveType.VARIANT); this.predefinedFields = Lists.newArrayList(); + this.skipPatterns = Lists.newArrayList(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -79,6 +83,7 @@ public VariantType(ArrayList fields) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); this.predefinedFields = fields; + this.skipPatterns = Lists.newArrayList(); for (VariantField predefinedField : this.predefinedFields) { fieldMap.put(predefinedField.getPattern(), predefinedField); } @@ -94,6 +99,7 @@ public VariantType(ArrayList fields) { public VariantType(Map properties) { super(PrimitiveType.VARIANT); this.predefinedFields = Lists.newArrayList(); + this.skipPatterns = Lists.newArrayList(); this.properties = properties; this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; @@ -108,6 +114,7 @@ public VariantType(ArrayList fields, Map propertie super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); this.predefinedFields = fields; + this.skipPatterns = Lists.newArrayList(); for (VariantField predefinedField : this.predefinedFields) { fieldMap.put(predefinedField.getPattern(), predefinedField); } @@ -121,7 +128,8 @@ public VariantType(ArrayList fields, Map propertie this.variantDocShardCount = 64; } - public VariantType(ArrayList fields, int variantMaxSubcolumnsCount, + public VariantType(ArrayList fields, ArrayList skipPatterns, + int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, int variantSparseHashShardCount, @@ -131,6 +139,7 @@ public VariantType(ArrayList fields, int variantMaxSubcolumnsCount super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); this.predefinedFields = fields; + this.skipPatterns = skipPatterns != null ? 
skipPatterns : Lists.newArrayList(); for (VariantField predefinedField : this.predefinedFields) { fieldMap.put(predefinedField.getPattern(), predefinedField); } @@ -148,6 +157,11 @@ public String toSql(int depth) { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); + if (!skipPatterns.isEmpty()) { + sb.append(skipPatterns.stream() + .map(VariantSkipPattern::toSql).collect(Collectors.joining(","))); + sb.append(","); + } if (!predefinedFields.isEmpty()) { sb.append(predefinedFields.stream() .map(variantField -> variantField.toSql(depth)).collect(Collectors.joining(","))); @@ -185,6 +199,10 @@ public ArrayList getPredefinedFields() { return predefinedFields; } + public ArrayList getSkipPatterns() { + return skipPatterns; + } + @Override public void toThrift(TTypeDesc container) { super.toThrift(container); @@ -215,6 +233,7 @@ public boolean equals(Object other) { } VariantType otherVariantType = (VariantType) other; return Objects.equals(otherVariantType.getPredefinedFields(), predefinedFields) + && Objects.equals(otherVariantType.getSkipPatterns(), skipPatterns) && variantMaxSubcolumnsCount == otherVariantType.variantMaxSubcolumnsCount && enableTypedPathsToSparse == otherVariantType.enableTypedPathsToSparse && enableVariantDocMode == otherVariantType.enableVariantDocMode diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 12a73beb2c5559..33106fa3406049 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -517,6 +517,7 @@ SHAPE: 'SHAPE'; SHOW: 'SHOW'; SIGNED: 'SIGNED'; SKEW: 'SKEW'; +SKIP_: 'SKIP'; SMALLINT: 'SMALLINT'; SNAPSHOT: 'SNAPSHOT'; SNAPSHOTS: 'SNAPSHOTS'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 
be2c75019006a4..076fec93af498d 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -1874,6 +1874,7 @@ variantSubColTypeList ; variantSubColType : variantSubColMatchType? STRING_LITERAL COLON dataType commentSpec? + | SKIP_ variantSubColMatchType? STRING_LITERAL ; variantSubColMatchType : (MATCH_NAME | MATCH_NAME_GLOB) @@ -2247,6 +2248,7 @@ nonReserved | SESSION_USER | SHAPE | SKEW + | SKIP_ | SNAPSHOT | SNAPSHOTS | SONAME diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java index 457259856cfb9f..20ab041a296706 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java @@ -33,6 +33,7 @@ import org.apache.doris.thrift.TColumnType; import org.apache.doris.thrift.TPatternType; import org.apache.doris.thrift.TPrimitiveType; +import org.apache.doris.thrift.TSkipPattern; import com.google.common.base.Strings; import com.google.common.collect.Lists; @@ -659,6 +660,12 @@ public TColumn toThrift() { tColumn.setVariantEnableDocMode(this.getVariantEnableDocMode()); tColumn.setVariantDocMaterializationMinRows(this.getvariantDocMaterializationMinRows()); tColumn.setVariantDocHashShardCount(this.getVariantDocShardCount()); + tColumn.setSkipPatterns(this.getVariantSkipPatterns().stream().map(sp -> { + TSkipPattern tsp = new TSkipPattern(); + tsp.setPattern(sp.getPattern()); + tsp.setPatternType(sp.getPatternType()); + return tsp; + }).collect(java.util.stream.Collectors.toList())); // ATTN: // Currently, this `toThrift()` method is only used from CreateReplicaTask. // And CreateReplicaTask does not need `defineExpr` field. 
@@ -886,6 +893,13 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws builder.setVariantEnableDocMode(this.getVariantEnableDocMode()); builder.setVariantDocMaterializationMinRows(this.getvariantDocMaterializationMinRows()); builder.setVariantDocHashShardCount(this.getVariantDocShardCount()); + builder.addAllSkipPatterns(this.getVariantSkipPatterns().stream().map(sp -> + OlapFile.SkipPatternPB.newBuilder() + .setPattern(sp.getPattern()) + .setPatternType(sp.getPatternType() == TPatternType.MATCH_NAME + ? PatternTypePB.MATCH_NAME : PatternTypePB.MATCH_NAME_GLOB) + .build() + ).collect(java.util.stream.Collectors.toList())); // variant may contain predefined structured fields addChildren(builder); } @@ -1336,6 +1350,10 @@ public int getVariantDocShardCount() { return type.isVariantType() ? ((ScalarType) type).getVariantDocShardCount() : 128; } + public ArrayList getVariantSkipPatterns() { + return type.isVariantType() ? ((VariantType) type).getSkipPatterns() : Lists.newArrayList(); + } + public void setFieldPatternType(TPatternType type) { fieldPatternType = type; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 82314c719a0dbc..529c6f14ce05b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -1065,6 +1065,7 @@ import org.apache.doris.nereids.types.StructType; import org.apache.doris.nereids.types.VarcharType; import org.apache.doris.nereids.types.VariantField; +import org.apache.doris.nereids.types.VariantSkipPattern; import org.apache.doris.nereids.types.VariantType; import org.apache.doris.nereids.types.coercion.CharacterType; import org.apache.doris.nereids.util.ExpressionUtils; @@ -5100,8 +5101,23 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) 
"Unsupported variant definition: " + variantDef.getText()); VariantContext variantCtx = (VariantContext) variantDef; - List fields = variantCtx.variantSubColTypeList() != null - ? visitVariantSubColTypeList(variantCtx.variantSubColTypeList()) : Lists.newArrayList(); + List fields = Lists.newArrayList(); + List skipPatterns = Lists.newArrayList(); + if (variantCtx.variantSubColTypeList() != null) { + for (VariantSubColTypeContext subCtx : variantCtx.variantSubColTypeList().variantSubColType()) { + if (subCtx.SKIP_() != null) { + String skipPattern = subCtx.STRING_LITERAL().getText(); + skipPattern = skipPattern.substring(1, skipPattern.length() - 1); + String matchType = subCtx.variantSubColMatchType() != null + ? subCtx.variantSubColMatchType().getText() : null; + skipPatterns.add(matchType != null + ? new VariantSkipPattern(skipPattern, matchType) + : new VariantSkipPattern(skipPattern)); + } else { + fields.add(visitVariantSubColType(subCtx)); + } + } + } Map properties = variantCtx.properties != null ? 
Maps.newHashMap(visitPropertyClause(variantCtx.properties)) : Maps.newHashMap(); @@ -5184,7 +5200,7 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_DOC_HASH_SHARD_COUNT); } - return new VariantType(fields, variantMaxSubcolumnsCount, enableTypedPathsToSparse, + return new VariantType(fields, skipPatterns, variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocHashShardCount); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index 911dc2e4e2cd51..3d1283b6199275 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -485,7 +485,11 @@ public static DataType fromCatalogType(Type type) { .map(cf -> new VariantField(cf.getPattern(), fromCatalogType(cf.getType()), cf.getComment() == null ? 
"" : cf.getComment(), cf.getPatternType().toString())) .collect(ImmutableList.toImmutableList()); - return new VariantType(variantFields, + List variantSkipPatterns = ((org.apache.doris.catalog.VariantType) type) + .getSkipPatterns().stream() + .map(sp -> new VariantSkipPattern(sp.getPattern(), sp.getPatternType().name())) + .collect(ImmutableList.toImmutableList()); + return new VariantType(variantFields, variantSkipPatterns, ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount(), ((org.apache.doris.catalog.VariantType) type).getEnableTypedPathsToSparse(), ((org.apache.doris.catalog.VariantType) type).getVariantMaxSparseColumnStatisticsSize(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java new file mode 100644 index 00000000000000..3dd6b2bb7f0217 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.types; + +import org.apache.doris.common.GlobRegexUtil; +import org.apache.doris.thrift.TPatternType; + +import com.google.re2j.Pattern; +import com.google.re2j.PatternSyntaxException; + +import java.util.Objects; + +/** + * A skip pattern inside a VariantType. + * Specifies field paths that should be irreversibly pruned during data ingestion. + */ +public class VariantSkipPattern { + private final String pattern; + private final TPatternType patternType; + + /** + * VariantSkipPattern Constructor with default MATCH_NAME_GLOB pattern type. + */ + public VariantSkipPattern(String pattern) { + this(pattern, TPatternType.MATCH_NAME_GLOB.name()); + } + + /** + * VariantSkipPattern Constructor. + * Validates glob patterns at DDL time — invalid globs are rejected immediately. + * + * @param pattern the glob or exact pattern string + * @param patternType "MATCH_NAME" for exact match, otherwise MATCH_NAME_GLOB + */ + public VariantSkipPattern(String pattern, String patternType) { + this.pattern = Objects.requireNonNull(pattern, "pattern should not be null"); + TPatternType type; + if (TPatternType.MATCH_NAME.name().equalsIgnoreCase(patternType)) { + type = TPatternType.MATCH_NAME; + } else { + type = TPatternType.MATCH_NAME_GLOB; + } + this.patternType = type; + // DDL-time validation: compile glob to catch syntax errors early + if (this.patternType == TPatternType.MATCH_NAME_GLOB) { + try { + GlobRegexUtil.getOrCompilePattern(this.pattern); + } catch (PatternSyntaxException | IllegalArgumentException e) { + throw new IllegalArgumentException( + "Invalid glob pattern for SKIP: '" + this.pattern + "': " + e.getMessage(), e); + } + } + } + + public String getPattern() { + return pattern; + } + + public TPatternType getPatternType() { + return patternType; + } + + /** + * Check if the given field path matches this skip pattern. + * Note: This method is currently unused in FE. 
The actual skip pattern matching + * is performed in BE's JSON parser (should_skip_path) during data ingestion. + * Kept here for potential future FE-side validation or testing use. + */ + public boolean matches(String fieldPath) { + if (patternType == TPatternType.MATCH_NAME) { + return pattern.equals(fieldPath); + } + try { + Pattern compiled = GlobRegexUtil.getOrCompilePattern(pattern); + return compiled.matcher(fieldPath).matches(); + } catch (PatternSyntaxException | IllegalArgumentException e) { + return false; + } + } + + /** + * Convert to SQL string representation. + */ + public String toSql() { + StringBuilder sb = new StringBuilder(); + sb.append("SKIP "); + if (patternType == TPatternType.MATCH_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } + + /** + * Convert to Catalog layer type. + */ + public org.apache.doris.catalog.VariantSkipPattern toCatalogType() { + return new org.apache.doris.catalog.VariantSkipPattern(pattern, patternType); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + VariantSkipPattern that = (VariantSkipPattern) o; + return Objects.equals(pattern, that.pattern) && patternType == that.patternType; + } + + @Override + public int hashCode() { + return Objects.hash(pattern, patternType); + } + + @Override + public String toString() { + return toSql(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index af25e1f9061f2f..4f005df31f3605 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -50,6 +50,7 @@ public class VariantType extends PrimitiveType { private final int variantMaxSparseColumnStatisticsSize; private final List 
predefinedFields; + private final List skipPatterns; private final int variantSparseHashShardCount; private final boolean enableVariantDocMode; @@ -64,6 +65,7 @@ public class VariantType extends PrimitiveType { public VariantType(int variantMaxSubcolumnsCount) { this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.predefinedFields = Lists.newArrayList(); + this.skipPatterns = Lists.newArrayList(); this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; this.variantSparseHashShardCount = 0; @@ -77,6 +79,7 @@ public VariantType(int variantMaxSubcolumnsCount) { */ public VariantType(List fields) { this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); + this.skipPatterns = Lists.newArrayList(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -97,11 +100,14 @@ public VariantType(List fields) { * @param enableVariantDocMode whether to enable variant doc snapshot writing mode * @param variantDocMaterializationMinRows minimum rows to generate doc snapshot columns */ - public VariantType(List fields, int variantMaxSubcolumnsCount, + public VariantType(List fields, List skipPatterns, + int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, int variantSparseHashShardCount, boolean enableVariantDocMode, long variantDocMaterializationMinRows, int variantDocShardCount) { this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); + this.skipPatterns = ImmutableList.copyOf( + Objects.requireNonNull(skipPatterns, "skipPatterns should not be null")); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; @@ -114,7 +120,8 @@ public VariantType(List fields, int 
variantMaxSubcolumnsCount, @Override public DataType conversion() { return new VariantType(predefinedFields.stream().map(VariantField::conversion) - .collect(Collectors.toList()), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + .collect(Collectors.toList()), skipPatterns, + variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount); @@ -124,7 +131,11 @@ public DataType conversion() { public Type toCatalogDataType() { org.apache.doris.catalog.VariantType type = new org.apache.doris.catalog.VariantType(predefinedFields.stream() .map(VariantField::toCatalogDataType) - .collect(Collectors.toCollection(ArrayList::new)), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + .collect(Collectors.toCollection(ArrayList::new)), + skipPatterns.stream() + .map(VariantSkipPattern::toCatalogType) + .collect(Collectors.toCollection(ArrayList::new)), + variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount); return type; @@ -140,6 +151,10 @@ public String toSql() { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); + if (!skipPatterns.isEmpty()) { + sb.append(skipPatterns.stream().map(VariantSkipPattern::toSql).collect(Collectors.joining(","))); + sb.append(","); + } if (!predefinedFields.isEmpty()) { sb.append(predefinedFields.stream().map(VariantField::toSql).collect(Collectors.joining(","))); sb.append(","); @@ -187,7 +202,8 @@ public boolean equals(Object o) { && this.enableTypedPathsToSparse == other.enableTypedPathsToSparse && this.enableVariantDocMode == other.enableVariantDocMode && this.variantDocMaterializationMinRows == other.variantDocMaterializationMinRows - && Objects.equals(predefinedFields, other.predefinedFields); + && Objects.equals(predefinedFields, 
other.predefinedFields) + && Objects.equals(skipPatterns, other.skipPatterns); } @Override @@ -216,7 +232,7 @@ public int hashCode() { return Objects.hash(super.hashCode(), variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount, - predefinedFields); + predefinedFields, skipPatterns); } @Override @@ -233,6 +249,10 @@ public List getPredefinedFields() { return predefinedFields; } + public List getSkipPatterns() { + return skipPatterns; + } + /** * Find the first matching VariantField for the given field name. * The matching is done in definition order, so the first matching pattern wins. diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 5470c83222fa66..d32b50f9561a76 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -396,12 +396,12 @@ private Pair getColumnType(List typeNodes, int start) int maxSubcolumns = typeNode.getVariantMaxSubcolumnsCount(); // Currently no predefined fields are carried in PTypeNode for VARIANT, so use empty list and default // values for other properties. 
- type = new VariantType(new ArrayList<>(), maxSubcolumns, + type = new VariantType(new ArrayList<>(), new ArrayList<>(), maxSubcolumns, /*enableTypedPathsToSparse*/ false, /*variantMaxSparseColumnStatisticsSize*/ 10000, /*variantSparseHashShardCount*/ 0, /*variantEnableDocMode*/ false, - /*variantDocMaterializationMinRows*/ 0, + /*variantDocMaterializationMinRows*/ 0L, /*variantDocShardCount*/ 0); parsedNodes = 1; } else { diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index ff54f54aed7242..f599ff64fc0593 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -348,6 +348,11 @@ enum PatternTypePB { MATCH_NAME_GLOB = 2; } +message SkipPatternPB { + optional string pattern = 1; + optional PatternTypePB pattern_type = 2; +} + message ColumnPB { required int32 unique_id = 1; // ColumnMessage.unique_id optional string name = 2; // ColumnMessage.name @@ -390,6 +395,8 @@ message ColumnPB { optional int64 variant_doc_materialization_min_rows = 32; // Number of buckets used to store doc map in variant doc mode. 
optional int32 variant_doc_hash_shard_count = 33 [default = 64]; + // skip patterns for variant column + repeated SkipPatternPB skip_patterns = 34; } // Dictionary of Schema info, to reduce TabletSchemaCloudPB fdb kv size diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index 11125c006493a4..eadcbdf77325d0 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -70,6 +70,11 @@ struct TColumnAccessPath { 3: optional TMetaAccessPath meta_access_path } +struct TSkipPattern { + 1: optional string pattern + 2: optional TPatternType pattern_type +} + struct TColumn { 1: required string column_name 2: required Types.TColumnType column_type @@ -99,6 +104,7 @@ struct TColumn { 26: optional bool variant_enable_doc_mode 27: optional i64 variant_doc_materialization_min_rows 28: optional i32 variant_doc_hash_shard_count + 29: optional list skip_patterns } struct TSlotDescriptor { diff --git a/regression-test/data/variant_p0/predefine/test_schema_template_skip.out b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out new file mode 100644 index 00000000000000..ec191be3265da6 --- /dev/null +++ b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out @@ -0,0 +1,95 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !skip_basic_glob_1 -- +1 visible +2 \N + +-- !skip_basic_glob_2 -- +1 \N +2 \N + +-- !skip_basic_glob_3 -- +1 \N +2 \N + +-- !skip_basic_glob_4 -- +1 \N +2 yes + +-- !skip_match_name_1 -- +1 \N + +-- !skip_match_name_2 -- +1 visible + +-- !skip_match_name_3 -- +1 open + +-- !skip_nested_1 -- +1 \N + +-- !skip_nested_2 -- +1 2 + +-- !skip_nested_3 -- +1 10 + +-- !skip_priority_1 -- +1 \N + +-- !skip_priority_2 -- +1 val + +-- !skip_glob_cross_1 -- +1 \N + +-- !skip_glob_cross_2 -- +1 2 + +-- !skip_glob_cross_3 -- +1 3 + +-- !skip_multi_1 -- +1 \N + +-- !skip_multi_2 -- +1 \N + +-- !skip_multi_3 -- +1 \N + +-- !skip_multi_4 -- +1 visible + +-- !skip_whole_col -- +1 {"normal_field":"visible"} +2 {"keep_me":"yes"} + +-- !skip_coexist_1 -- +1 \N + +-- !skip_coexist_2 -- +1 100 + +-- !skip_coexist_3 -- +1 val + +-- !skip_bulk_1 -- +0 + +-- !skip_bulk_2 -- +0 + +-- !skip_bulk_3 -- +0 + +-- !skip_bulk_4 -- +100 + +-- !skip_bulk_5 -- +100 + +-- !skip_bulk_6 -- +1 user_1 10 +50 user_50 500 +100 user_100 1000 + diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy new file mode 100644 index 00000000000000..f30158992b4bef --- /dev/null +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_schema_template_skip", "p0") { + sql """ set describe_extend_variant_column = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + + // Test 1: Basic SKIP glob + def tableName1 = "test_skip_basic_glob" + sql "DROP TABLE IF EXISTS ${tableName1}" + sql """CREATE TABLE ${tableName1} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName1} values(1, '{"debug_info":"secret","debug_trace":"trace_val","normal_field":"visible"}')""" + sql """insert into ${tableName1} values(2, '{"debug_level":5,"keep_me":"yes"}')""" + + qt_skip_basic_glob_1 """ SELECT id, data['normal_field'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_2 """ SELECT id, data['debug_info'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_3 """ SELECT id, data['debug_trace'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_4 """ SELECT id, data['keep_me'] FROM ${tableName1} ORDER BY id """ + + // Test 2: SKIP MATCH_NAME exact match + def tableName2 = "test_skip_match_name" + sql "DROP TABLE IF EXISTS ${tableName2}" + sql """CREATE TABLE ${tableName2} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName2} values(1, 
'{"secret":"hidden","secret_key":"visible","public":"open"}')""" + + qt_skip_match_name_1 """ SELECT id, data['secret'] FROM ${tableName2} ORDER BY id """ + qt_skip_match_name_2 """ SELECT id, data['secret_key'] FROM ${tableName2} ORDER BY id """ + qt_skip_match_name_3 """ SELECT id, data['public'] FROM ${tableName2} ORDER BY id """ + + // Test 3: Nested path SKIP + def tableName3 = "test_skip_nested_path" + sql "DROP TABLE IF EXISTS ${tableName3}" + sql """CREATE TABLE ${tableName3} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName3} values(1, '{"a":{"b":{"temp_1":1,"temp_12":10,"keep":2}}}')""" + + qt_skip_nested_1 """ SELECT id, data['a']['b']['temp_1'] FROM ${tableName3} ORDER BY id """ + qt_skip_nested_2 """ SELECT id, data['a']['b']['keep'] FROM ${tableName3} ORDER BY id """ + // temp_12 has 2 chars after temp_, so '?' 
should NOT match it + qt_skip_nested_3 """ SELECT id, data['a']['b']['temp_12'] FROM ${tableName3} ORDER BY id """ + + // Test 4: SKIP takes priority over typed pattern + def tableName4 = "test_skip_priority" + sql "DROP TABLE IF EXISTS ${tableName4}" + sql """CREATE TABLE ${tableName4} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName4} values(1, '{"num_a":100,"other":"val"}')""" + + qt_skip_priority_1 """ SELECT id, data['num_a'] FROM ${tableName4} ORDER BY id """ + qt_skip_priority_2 """ SELECT id, data['other'] FROM ${tableName4} ORDER BY id """ + + // Test 5: Invalid glob DDL rejection + test { + sql """CREATE TABLE test_skip_invalid_glob ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + exception "Invalid glob pattern" + } + + // Test 6: Glob cross-level matching — pattern spans nested path + def tableName6 = "test_skip_glob_cross_level" + sql "DROP TABLE IF EXISTS ${tableName6}" + sql """CREATE TABLE ${tableName6} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName6} values(1, '{"a":{"debug_x":1,"keep":2},"debug_y":3}')""" + + qt_skip_glob_cross_1 """ SELECT id, data['a']['debug_x'] FROM ${tableName6} ORDER BY id """ + qt_skip_glob_cross_2 """ SELECT id, data['a']['keep'] FROM ${tableName6} ORDER BY id """ + qt_skip_glob_cross_3 """ SELECT id, data['debug_y'] FROM ${tableName6} ORDER BY id """ + + // Test 7: Multiple SKIP patterns + def tableName7 = "test_skip_multiple" + sql "DROP TABLE IF EXISTS ${tableName7}" + sql """CREATE TABLE ${tableName7} ( + `id` bigint 
NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName7} values(1, '{"temp_data":"t","internal_id":1,"password":"secret","name":"visible"}')""" + + qt_skip_multi_1 """ SELECT id, data['temp_data'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_2 """ SELECT id, data['internal_id'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_3 """ SELECT id, data['password'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_4 """ SELECT id, data['name'] FROM ${tableName7} ORDER BY id """ + + // Test 8: SELECT whole column — skipped fields should not appear in JSON output + qt_skip_whole_col """ SELECT id, data FROM ${tableName1} ORDER BY id """ + + // Test 9: SKIP with non-conflicting typed pattern coexistence + def tableName9 = "test_skip_coexist" + sql "DROP TABLE IF EXISTS ${tableName9}" + sql """CREATE TABLE ${tableName9} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName9} values(1, '{"debug_x":1,"num_a":100,"other":"val"}')""" + + qt_skip_coexist_1 """ SELECT id, data['debug_x'] FROM ${tableName9} ORDER BY id """ + qt_skip_coexist_2 """ SELECT id, data['num_a'] FROM ${tableName9} ORDER BY id """ + qt_skip_coexist_3 """ SELECT id, data['other'] FROM ${tableName9} ORDER BY id """ + + // Test 10: Empty pattern rejection + test { + sql """CREATE TABLE test_skip_empty_pattern ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + exception "pattern" + } + + // Test 11: Bulk data — verify SKIP works correctly with larger dataset + def tableName11 = "test_skip_bulk" + sql "DROP TABLE IF EXISTS 
${tableName11}" + sql """CREATE TABLE ${tableName11} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + // Insert 100 rows with mixed fields + for (int i = 1; i <= 100; i++) { + sql """insert into ${tableName11} values(${i}, + '{"debug_id":${i},"debug_msg":"msg_${i}","internal":"secret_${i}","name":"user_${i}","value":${i * 10}}')""" + } + + // Skipped fields should all be NULL + qt_skip_bulk_1 """ SELECT count(*) FROM ${tableName11} WHERE data['debug_id'] IS NOT NULL """ + qt_skip_bulk_2 """ SELECT count(*) FROM ${tableName11} WHERE data['debug_msg'] IS NOT NULL """ + qt_skip_bulk_3 """ SELECT count(*) FROM ${tableName11} WHERE data['internal'] IS NOT NULL """ + // Non-skipped fields should all be present + qt_skip_bulk_4 """ SELECT count(*) FROM ${tableName11} WHERE data['name'] IS NOT NULL """ + qt_skip_bulk_5 """ SELECT count(*) FROM ${tableName11} WHERE data['value'] IS NOT NULL """ + // Spot check specific rows + qt_skip_bulk_6 """ SELECT id, data['name'], data['value'] FROM ${tableName11} WHERE id IN (1, 50, 100) ORDER BY id """ +} From c2edcf25b417336c6739507d3d6fcef7c839ae54 Mon Sep 17 00:00:00 2001 From: Gary Date: Thu, 12 Feb 2026 03:08:58 +0800 Subject: [PATCH 2/8] skip pattern optimization --- be/src/olap/tablet_meta.cpp | 48 ++-- be/src/olap/tablet_schema.cpp | 9 +- be/src/vec/common/variant_util.cpp | 104 ++++++++ be/src/vec/common/variant_util.h | 27 +- be/src/vec/json/json_parser.cpp | 66 ++++- be/src/vec/json/json_parser.h | 20 ++ .../rowset/segment_v2/variant_util_test.cpp | 247 ++++++++++++++++++ .../nereids/parser/LogicalPlanBuilder.java | 7 - .../test_schema_template_skip.groovy | 13 +- 9 files changed, 485 insertions(+), 56 deletions(-) diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 29df5c169d1f52..ec3f9ca54bb9c5 100644 --- a/be/src/olap/tablet_meta.cpp +++ 
b/be/src/olap/tablet_meta.cpp @@ -82,6 +82,21 @@ bvar::Window> g_contains_agg_with_cache_if_eligible_full_h "g_contains_agg_with_cache_if_eligible_full_hit_1m", &g_contains_agg_with_cache_if_eligible_full_hit, 60); +namespace { + +inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) { + switch (pattern_type) { + case TPatternType::MATCH_NAME: + return PatternTypePB::MATCH_NAME; + case TPatternType::MATCH_NAME_GLOB: + return PatternTypePB::MATCH_NAME_GLOB; + default: + return PatternTypePB::MATCH_NAME_GLOB; + } +} + +} // namespace + TabletMetaSharedPtr TabletMeta::create( const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, uint32_t next_unique_id, @@ -533,13 +548,7 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count); } if (tcolumn.__isset.pattern_type) { - switch (tcolumn.pattern_type) { - case TPatternType::MATCH_NAME: - column->set_pattern_type(PatternTypePB::MATCH_NAME); - break; - case TPatternType::MATCH_NAME_GLOB: - column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); - } + column->set_pattern_type(to_pattern_type_pb(tcolumn.pattern_type)); } if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) { column->set_variant_enable_typed_paths_to_sparse( @@ -562,22 +571,19 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco if (tcolumn.__isset.variant_doc_hash_shard_count) { column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count); } - if (tcolumn.__isset.skip_patterns) { + if (tcolumn.__isset.skip_patterns && !tcolumn.skip_patterns.empty()) { + auto* skip_patterns = column->mutable_skip_patterns(); + skip_patterns->Reserve(cast_set(tcolumn.skip_patterns.size())); for (const auto& tsp : tcolumn.skip_patterns) { - auto* sp = column->add_skip_patterns(); - if (tsp.__isset.pattern) { - sp->set_pattern(tsp.pattern); - } - if 
(tsp.__isset.pattern_type) { - switch (tsp.pattern_type) { - case TPatternType::MATCH_NAME: - sp->set_pattern_type(PatternTypePB::MATCH_NAME); - break; - case TPatternType::MATCH_NAME_GLOB: - sp->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); - break; - } + // Skip invalid entries to avoid persisting empty rules. + if (!tsp.__isset.pattern || tsp.pattern.empty()) { + continue; } + auto* sp = skip_patterns->Add(); + sp->set_pattern(tsp.pattern); + sp->set_pattern_type(tsp.__isset.pattern_type + ? to_pattern_type_pb(tsp.pattern_type) + : PatternTypePB::MATCH_NAME_GLOB); } } } diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index ad73d983cb4d4f..1aba9db348a252 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -680,7 +680,12 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { if (column.has_pattern_type()) { _pattern_type = column.pattern_type(); } + _variant.skip_patterns.clear(); + _variant.skip_patterns.reserve(column.skip_patterns_size()); for (const auto& sp : column.skip_patterns()) { + if (!sp.has_pattern() || sp.pattern().empty()) { + continue; + } PatternTypePB pt = sp.has_pattern_type() ? 
sp.pattern_type() : PatternTypePB::MATCH_NAME_GLOB; _variant.skip_patterns.emplace_back(sp.pattern(), pt); } @@ -767,8 +772,10 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_variant_enable_doc_mode(_variant.enable_doc_mode); column->set_variant_doc_materialization_min_rows(_variant.doc_materialization_min_rows); column->set_variant_doc_hash_shard_count(_variant.doc_hash_shard_count); + auto* skip_patterns = column->mutable_skip_patterns(); + skip_patterns->Reserve(cast_set(_variant.skip_patterns.size())); for (const auto& [pattern, pt] : _variant.skip_patterns) { - auto* sp = column->add_skip_patterns(); + auto* sp = skip_patterns->Add(); sp->set_pattern(pattern); sp->set_pattern_type(pt); } diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 6e901b27310b60..137bd5d723ce58 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,7 @@ #include "olap/tablet_fwd.h" #include "olap/tablet_schema.h" #include "re2/re2.h" +#include "re2/set.h" #include "runtime/client_cache.h" #include "runtime/define_primitive_type.h" #include "runtime/exec_env.h" @@ -130,6 +132,27 @@ inline void append_escaped_regex_char(std::string* regex_output, char ch) { // Small LRU to cap compiled glob patterns constexpr size_t kGlobRegexCacheCapacity = 256; +constexpr size_t kSkipRe2SetThreshold = 32; + +struct TransparentStringHash { + using is_transparent = void; + size_t operator()(std::string_view s) const { return std::hash {}(s); } + size_t operator()(const std::string& s) const { + return std::hash {}(std::string_view(s)); + } +}; + +struct TransparentStringEq { + using is_transparent = void; + bool operator()(std::string_view lhs, std::string_view rhs) const { return lhs == rhs; } +}; + +struct CompiledSkipMatcher { + phmap::flat_hash_set exact_patterns; + std::vector> glob_regexes; + std::unique_ptr 
glob_regex_set; + bool use_re2_set = false; +}; struct GlobRegexCacheEntry { std::shared_ptr re2; @@ -259,6 +282,84 @@ bool glob_match_re2(const std::string& glob_pattern, const std::string& candidat return RE2::FullMatch(candidate_path, *compiled); } +Status build_compiled_skip_matcher( + const std::vector>& skip_patterns, + bool enable_re2_set, std::shared_ptr* out) { + if (out == nullptr) { + return Status::InvalidArgument("Output pointer for compiled skip matcher is null"); + } + + auto matcher = std::make_shared(); + matcher->exact_patterns.reserve(skip_patterns.size()); + + std::vector glob_regex_patterns; + glob_regex_patterns.reserve(skip_patterns.size()); + for (const auto& [pattern, pt] : skip_patterns) { + if (pt == PatternTypePB::MATCH_NAME) { + matcher->exact_patterns.insert(pattern); + continue; + } + + std::string regex_pattern; + RETURN_IF_ERROR(glob_to_regex(pattern, ®ex_pattern)); + glob_regex_patterns.emplace_back(std::move(regex_pattern)); + } + + if (glob_regex_patterns.empty()) { + *out = std::move(matcher); + return Status::OK(); + } + + if (enable_re2_set && glob_regex_patterns.size() >= kSkipRe2SetThreshold) { + RE2::Options options; + auto set = std::make_unique(options, RE2::ANCHOR_BOTH); + for (const auto& regex_pattern : glob_regex_patterns) { + if (set->Add(regex_pattern, nullptr) < 0) { + return Status::InvalidArgument( + "Failed to add regexp '{}' into skip pattern matcher set", regex_pattern); + } + } + if (!set->Compile()) { + return Status::InvalidArgument("Failed to compile skip pattern matcher set"); + } + matcher->glob_regex_set = std::move(set); + matcher->use_re2_set = true; + } else { + matcher->glob_regexes.reserve(glob_regex_patterns.size()); + for (const auto& regex_pattern : glob_regex_patterns) { + auto compiled = std::make_unique(regex_pattern); + if (!compiled->ok()) { + return Status::InvalidArgument( + "Invalid regexp '{}' generated from skip glob pattern: {}", regex_pattern, + compiled->error()); + } + 
matcher->glob_regexes.emplace_back(std::move(compiled)); + } + } + + *out = std::move(matcher); + return Status::OK(); +} + +bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path) { + if (matcher.exact_patterns.find(path) != matcher.exact_patterns.end()) { + return true; + } + + if (matcher.use_re2_set) { + std::vector matched_indexes; + return matcher.glob_regex_set->Match(path, &matched_indexes); + } + + for (const auto& regex : matcher.glob_regexes) { + if (RE2::FullMatch(path, *regex)) { + return true; + } + } + + return false; +} + size_t get_number_of_dimensions(const IDataType& type) { if (const auto* type_array = typeid_cast(&type)) { return type_array->get_number_of_dimensions(); @@ -2049,6 +2150,9 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t // set skip patterns if any if (!column.variant_params().skip_patterns.empty()) { configs[i].skip_patterns = &column.variant_params().skip_patterns; + RETURN_IF_ERROR(build_compiled_skip_matcher(column.variant_params().skip_patterns, + true, + &configs[i].compiled_skip_matcher)); } // if doc mode is not enabled, no need to parse to doc value column if (!column.variant_enable_doc_mode()) { diff --git a/be/src/vec/common/variant_util.h b/be/src/vec/common/variant_util.h index e3532265056704..42cf49f2d14e92 100644 --- a/be/src/vec/common/variant_util.h +++ b/be/src/vec/common/variant_util.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,27 +66,35 @@ const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__"; namespace doris::vectorized::variant_util { +struct CompiledSkipMatcher; + // Convert a restricted glob pattern into a regex (for tests/internal use). Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); // Match a glob pattern against a path using RE2. 
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); +// Build an immutable matcher for skip patterns used in hot parsing paths. +Status build_compiled_skip_matcher( + const std::vector>& skip_patterns, + bool enable_re2_set, std::shared_ptr* out); + +// Match a dot-separated path against precompiled skip patterns. +bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path); + // Check if a dot-separated path should be skipped based on skip patterns. // For MATCH_NAME_GLOB, uses glob matching; for MATCH_NAME, uses exact string comparison. inline bool should_skip_path( const std::vector>& skip_patterns, const std::string& path) { for (const auto& [pattern, pt] : skip_patterns) { - if (pt == PatternTypePB::MATCH_NAME) { - if (path == pattern) { - return true; - } - } else { - // MATCH_NAME_GLOB - if (glob_match_re2(pattern, path)) { - return true; - } + if (pt == PatternTypePB::MATCH_NAME && path == pattern) { + return true; + } + } + for (const auto& [pattern, pt] : skip_patterns) { + if (pt != PatternTypePB::MATCH_NAME && glob_match_re2(pattern, path)) { + return true; } } return false; diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index 59fb293b34a934..d36fef6fd0298f 100644 --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -48,6 +48,13 @@ std::optional JSONDataParser::parse(const char* begin, context.enable_flatten_nested = config.enable_flatten_nested; context.is_top_array = document.isArray(); context.skip_patterns = config.skip_patterns; + context.skip_matcher = config.compiled_skip_matcher.get(); + context.skip_result_cache_capacity = config.skip_result_cache_capacity; + if (context.skip_result_cache_capacity > 0 && + (context.skip_matcher != nullptr || + (context.skip_patterns != nullptr && !context.skip_patterns->empty()))) { + context.skip_cache.reserve(context.skip_result_cache_capacity); + } traverse(document, context); ParseResult 
result; result.values = std::move(context.values); @@ -102,21 +109,63 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC fmt::format("Key length exceeds maximum allowed size of {} bytes.", max_key_length)); } + const bool has_skip_patterns = ctx.skip_matcher != nullptr || + (ctx.skip_patterns != nullptr && + !ctx.skip_patterns->empty()); // Check skip patterns: build the dot-separated path and test against patterns - if (ctx.skip_patterns != nullptr && !ctx.skip_patterns->empty()) { - std::string saved_path = ctx.current_path; + if (has_skip_patterns) { + const size_t old_length = ctx.current_path.size(); + const size_t required_capacity = old_length + (old_length ? 1 : 0) + key.size(); + if (ctx.current_path.capacity() < required_capacity) { + ctx.current_path.reserve(required_capacity); + } if (!ctx.current_path.empty()) { ctx.current_path.push_back('.'); } ctx.current_path.append(key.data(), key.size()); - if (variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path)) { - ctx.current_path = std::move(saved_path); + + bool is_skipped = false; + if (ctx.skip_result_cache_capacity > 0) { + auto cache_it = ctx.skip_cache.find(ctx.current_path); + if (cache_it != ctx.skip_cache.end()) { + is_skipped = cache_it->second.is_skipped; + ctx.skip_cache_lru.splice(ctx.skip_cache_lru.begin(), ctx.skip_cache_lru, + cache_it->second.lru_it); + } else { + if (ctx.skip_matcher != nullptr) { + is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); + } else { + is_skipped = variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path); + } + + if (ctx.skip_cache.size() >= ctx.skip_result_cache_capacity && + !ctx.skip_cache_lru.empty()) { + const auto& evicted_key = ctx.skip_cache_lru.back(); + ctx.skip_cache.erase(evicted_key); + ctx.skip_cache_lru.pop_back(); + } + + std::string cache_key(ctx.current_path); + ctx.skip_cache_lru.push_front(cache_key); + typename ParseContext::SkipCacheEntry cache_entry; + 
cache_entry.is_skipped = is_skipped; + cache_entry.lru_it = ctx.skip_cache_lru.begin(); + ctx.skip_cache.emplace(std::move(cache_key), std::move(cache_entry)); + } + } else if (ctx.skip_matcher != nullptr) { + is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); + } else { + is_skipped = variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path); + } + + if (is_skipped) { + ctx.current_path.resize(old_length); continue; // skip this key and its entire subtree } ctx.builder.append(key, false); traverse(value, ctx); ctx.builder.pop_back(); - ctx.current_path = std::move(saved_path); + ctx.current_path.resize(old_length); } else { ctx.builder.append(key, false); traverse(value, ctx); @@ -225,8 +274,13 @@ void JSONDataParser::traverseArrayElement(const Element& element, ParseContext element_ctx; element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; + element_ctx.skip_patterns = nullptr; + element_ctx.skip_matcher = nullptr; + element_ctx.skip_result_cache_capacity = 0; traverse(element, element_ctx); - auto& [_, paths, values, flatten_nested, __, is_top_array, ___, ____] = element_ctx; + auto& paths = element_ctx.paths; + auto& values = element_ctx.values; + const bool is_top_array = element_ctx.is_top_array; if (element_ctx.has_nested_in_flatten && is_top_array) { checkAmbiguousStructure(ctx, paths); diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 251ff4ca28446e..962bb2edcd5ec1 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -23,6 +23,8 @@ #include #include +#include +#include #include #include #include @@ -39,6 +41,9 @@ #include "vec/json/simd_json_parser.h" namespace doris::vectorized { +namespace variant_util { +struct CompiledSkipMatcher; +} template Field getValueAsField(const Element& element) { @@ -110,6 +115,10 @@ struct ParseConfig { ParseTo parse_to = ParseTo::OnlySubcolumns; // skip patterns for 
variant column (pointer to avoid copy; nullptr means no skip) const std::vector>* skip_patterns = nullptr; + // pre-compiled skip matcher for hot parsing path + std::shared_ptr compiled_skip_matcher = nullptr; + // per-parse cache size for "path -> skip result", 0 means disabled + uint16_t skip_result_cache_capacity = 256; }; /// Result of parsing of a document. /// Contains all paths extracted from document @@ -128,6 +137,11 @@ class JSONDataParser { private: struct ParseContext { + struct SkipCacheEntry { + bool is_skipped = false; + std::list::iterator lru_it; + }; + PathInDataBuilder builder; std::vector paths; std::vector values; @@ -136,6 +150,12 @@ class JSONDataParser { bool is_top_array = false; // skip patterns pointer (nullptr means no skip) const std::vector>* skip_patterns = nullptr; + // pre-compiled skip matcher (nullptr means use skip_patterns fallback) + const variant_util::CompiledSkipMatcher* skip_matcher = nullptr; + // max entries for skip result cache in one parse invocation + uint16_t skip_result_cache_capacity = 0; + phmap::flat_hash_map skip_cache; + std::list skip_cache_lru; // incrementally maintained dot-separated path for skip matching std::string current_path; }; diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index bb87ee0ebd7d78..4a066297147a2b 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -17,11 +17,16 @@ #include "testutil/variant_util.h" +#include +#include +#include +#include #include #include #include #include "gen_cpp/olap_file.pb.h" +#include "glog/logging.h" #include "gtest/gtest.h" #include "olap/tablet_schema.h" #include "vec/columns/column_string.h" @@ -42,6 +47,169 @@ static vectorized::ColumnString::MutablePtr _make_json_column( return col; } +static uint64_t _splitmix64(uint64_t x) { + x += 0x9e3779b97f4a7c15ULL; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x 
^ (x >> 27)) * 0x94d049bb133111ebULL; + return x ^ (x >> 31); +} + +static std::string _path_of_leaf_id(size_t leaf_id) { + const size_t g = leaf_id / 1000; + const size_t s = (leaf_id / 100) % 10; + const size_t t = (leaf_id / 10) % 10; + const size_t k = leaf_id % 10; + std::string path; + path.reserve(16); + path += "g"; + path.push_back(static_cast('0' + g)); + path += ".s"; + path.push_back(static_cast('0' + s)); + path += ".t"; + path.push_back(static_cast('0' + t)); + path += ".k"; + path.push_back(static_cast('0' + k)); + return path; +} + +static std::string _build_nested_json_row(size_t row_idx, uint64_t seed) { + std::string root; + root.reserve(220000); + root.push_back('{'); + bool first_g = true; + for (size_t g = 0; g < 10; ++g) { + std::string g_obj; + g_obj.push_back('{'); + bool first_s = true; + for (size_t s = 0; s < 10; ++s) { + std::string s_obj; + s_obj.push_back('{'); + bool first_t = true; + for (size_t t = 0; t < 10; ++t) { + std::string t_obj; + t_obj.push_back('{'); + bool first_k = true; + for (size_t k = 0; k < 10; ++k) { + const size_t leaf_id = ((g * 10 + s) * 10 + t) * 10 + k; + // Keep 10k nested columns per row to stress skip-pattern matching. 
+ if (!first_k) { + t_obj.push_back(','); + } + first_k = false; + const uint64_t value = + _splitmix64(seed ^ (static_cast(row_idx) << 32) ^ leaf_id) % + 1000003ULL; + t_obj += "\"k"; + t_obj.push_back(static_cast('0' + k)); + t_obj += "\":"; + t_obj += std::to_string(value); + } + if (!first_k) { + t_obj.push_back('}'); + if (!first_t) { + s_obj.push_back(','); + } + first_t = false; + s_obj += "\"t"; + s_obj.push_back(static_cast('0' + t)); + s_obj += "\":"; + s_obj += t_obj; + } + } + if (!first_t) { + s_obj.push_back('}'); + if (!first_s) { + g_obj.push_back(','); + } + first_s = false; + g_obj += "\"s"; + g_obj.push_back(static_cast('0' + s)); + g_obj += "\":"; + g_obj += s_obj; + } + } + if (!first_s) { + g_obj.push_back('}'); + if (!first_g) { + root.push_back(','); + } + first_g = false; + root += "\"g"; + root.push_back(static_cast('0' + g)); + root += "\":"; + root += g_obj; + } + } + root += ",\"meta\":{\"row_id\":"; + root += std::to_string(row_idx); + root += ",\"rand\":"; + root += std::to_string(_splitmix64(seed + row_idx) % 9973ULL); + root += "}}"; + return root; +} + +static std::vector _build_nested_json_rows(size_t rows, uint64_t seed) { + std::vector result; + result.reserve(rows); + for (size_t i = 0; i < rows; ++i) { + result.emplace_back(_build_nested_json_row(i, seed)); + } + return result; +} + +static vectorized::ColumnString::MutablePtr _make_json_column( + const std::vector& rows) { + auto col = vectorized::ColumnString::create(); + for (const auto& row : rows) { + col->insert_data(row.data(), row.size()); + } + return col; +} + +static std::vector> _build_skip_patterns_for_perf() { + std::vector> patterns; + patterns.reserve(96); + + // Exact match patterns. + for (size_t leaf_id = 0; leaf_id < 10000; leaf_id += 211) { + patterns.emplace_back(_path_of_leaf_id(leaf_id), PatternTypePB::MATCH_NAME); + } + + // Unmatched glob patterns to amplify old per-pattern matching cost. 
+ for (int i = 0; i < 30; ++i) { + patterns.emplace_back("x" + std::to_string(i) + "*.s?.t?.k?", + PatternTypePB::MATCH_NAME_GLOB); + } + + // Matched glob patterns. + for (size_t g = 0; g < 10; ++g) { + std::string pattern = "g"; + pattern.push_back(static_cast('0' + g)); + pattern += ".s?.t?.k[02468]"; + patterns.emplace_back(std::move(pattern), PatternTypePB::MATCH_NAME_GLOB); + } + + return patterns; +} + +struct PerfParseResult { + vectorized::ColumnVariant::MutablePtr column; + int64_t elapsed_ms = 0; +}; + +static PerfParseResult _run_parse_perf(const vectorized::ColumnString& json_column, + const vectorized::ParseConfig& config) { + auto variant = vectorized::ColumnVariant::create(0); + const auto start = std::chrono::steady_clock::now(); + parse_json_to_variant(*variant, json_column, config); + const auto end = std::chrono::steady_clock::now(); + PerfParseResult result; + result.column = std::move(variant); + result.elapsed_ms = + std::chrono::duration_cast(end - start).count(); + return result; +} + TEST(VariantUtilTest, ParseDocValueToSubcolumns_FillsDefaultsAndValues) { const std::vector jsons = { R"({"a":1,"b":"x"})", // @@ -341,4 +509,83 @@ TEST(VariantUtilTest, GlobMatchRe2) { EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b")); } +TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { + if (std::getenv("DORIS_RUN_VARIANT_SKIP_PERF_UT") == nullptr) { + GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; + } + + constexpr size_t kRows = 1000; + constexpr uint64_t kSeed = 0x20260211ULL; + const auto json_rows = _build_nested_json_rows(kRows, kSeed); + const auto json_column = _make_json_column(json_rows); + const auto skip_patterns = _build_skip_patterns_for_perf(); + + vectorized::ParseConfig no_skip_config; + no_skip_config.enable_flatten_nested = false; + no_skip_config.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + + vectorized::ParseConfig legacy_config; + legacy_config.enable_flatten_nested 
= false; + legacy_config.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + legacy_config.skip_patterns = &skip_patterns; + legacy_config.compiled_skip_matcher = nullptr; + legacy_config.skip_result_cache_capacity = 0; + + std::shared_ptr compiled_matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &compiled_matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + vectorized::ParseConfig optimized_config = legacy_config; + optimized_config.compiled_skip_matcher = compiled_matcher; + optimized_config.skip_result_cache_capacity = 256; + + auto no_skip_result = _run_parse_perf(*json_column, no_skip_config); + auto legacy_result = _run_parse_perf(*json_column, legacy_config); + auto optimized_result = _run_parse_perf(*json_column, optimized_config); + + ASSERT_EQ(no_skip_result.column->size(), kRows); + ASSERT_EQ(legacy_result.column->size(), kRows); + ASSERT_EQ(optimized_result.column->size(), kRows); + + vectorized::DataTypeSerDe::FormatOptions options; + bool found_no_skip_difference = false; + for (size_t row = 0; row < kRows; row += 97) { + std::string no_skip_row; + std::string legacy_row; + std::string optimized_row; + no_skip_result.column->serialize_one_row_to_string(row, &no_skip_row, options); + legacy_result.column->serialize_one_row_to_string(row, &legacy_row, options); + optimized_result.column->serialize_one_row_to_string(row, &optimized_row, options); + if (!found_no_skip_difference && no_skip_row != legacy_row) { + found_no_skip_difference = true; + } + ASSERT_EQ(legacy_row, optimized_row) << "row=" << row; + } + ASSERT_TRUE(found_no_skip_difference) + << "no-skip output should differ from skip-enabled output on sampled rows"; + + const double legacy_vs_no_skip = no_skip_result.elapsed_ms > 0 + ? static_cast(legacy_result.elapsed_ms) / + static_cast(no_skip_result.elapsed_ms) + : 0.0; + const double optimized_vs_no_skip = no_skip_result.elapsed_ms > 0 + ? 
static_cast(optimized_result.elapsed_ms) / + static_cast(no_skip_result.elapsed_ms) + : 0.0; + const double optimized_vs_legacy = optimized_result.elapsed_ms > 0 + ? static_cast(legacy_result.elapsed_ms) / + static_cast(optimized_result.elapsed_ms) + : 0.0; + + LOG(INFO) << "skip-pattern perf compare (1000 rows, 10k nested columns, same random data): " + << "no_skip_ms=" << no_skip_result.elapsed_ms + << ", " + << "legacy_ms=" << legacy_result.elapsed_ms + << ", optimized_ms=" << optimized_result.elapsed_ms + << ", legacy_vs_no_skip=" << legacy_vs_no_skip + << ", optimized_vs_no_skip=" << optimized_vs_no_skip + << ", optimized_vs_legacy=" << optimized_vs_legacy + << ", skip_patterns=" << skip_patterns.size(); +} + } // namespace doris::vectorized::variant_util diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 529c6f14ce05b0..2fa6e5feddc9cd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -477,7 +477,6 @@ import org.apache.doris.nereids.DorisParser.VariantContext; import org.apache.doris.nereids.DorisParser.VariantPredefinedFieldsContext; import org.apache.doris.nereids.DorisParser.VariantSubColTypeContext; -import org.apache.doris.nereids.DorisParser.VariantSubColTypeListContext; import org.apache.doris.nereids.DorisParser.VariantTypeDefinitionsContext; import org.apache.doris.nereids.DorisParser.WhereClauseContext; import org.apache.doris.nereids.DorisParser.WindowFrameContext; @@ -5209,12 +5208,6 @@ private static boolean isSupportedVariantDocModeType(DataType type) { return type.isStringLikeType() || type.isIntegralType() || type.isFloatLikeType() || type.isBooleanType(); } - @Override - public List visitVariantSubColTypeList(VariantSubColTypeListContext ctx) { - return 
ctx.variantSubColType().stream().map( - this::visitVariantSubColType).collect(ImmutableList.toImmutableList()); - } - @Override public VariantField visitVariantSubColType(VariantSubColTypeContext ctx) { String comment; diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy index f30158992b4bef..30ee9613e7cff5 100644 --- a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy @@ -149,18 +149,7 @@ suite("test_schema_template_skip", "p0") { qt_skip_coexist_2 """ SELECT id, data['num_a'] FROM ${tableName9} ORDER BY id """ qt_skip_coexist_3 """ SELECT id, data['other'] FROM ${tableName9} ORDER BY id """ - // Test 10: Empty pattern rejection - test { - sql """CREATE TABLE test_skip_empty_pattern ( - `id` bigint NULL, - `data` variant NOT NULL - ) ENGINE=OLAP DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" - exception "pattern" - } - - // Test 11: Bulk data — verify SKIP works correctly with larger dataset + // Test 10: Bulk data — verify SKIP works correctly with larger dataset def tableName11 = "test_skip_bulk" sql "DROP TABLE IF EXISTS ${tableName11}" sql """CREATE TABLE ${tableName11} ( From 0ae6e6e9c26d2774c6b2c9749ca39d5889a51a00 Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 00:02:57 +0800 Subject: [PATCH 3/8] reuse thrift and protobuf structure --- be/src/olap/tablet_meta.cpp | 24 +-- be/src/olap/tablet_schema.cpp | 20 +-- be/src/olap/tablet_schema.h | 8 +- be/src/vec/common/variant_util.cpp | 82 ++++++--- be/src/vec/common/variant_util.h | 42 +++-- be/src/vec/json/json_parser.cpp | 25 +-- be/src/vec/json/json_parser.h | 10 +- .../rowset/segment_v2/variant_util_test.cpp | 39 +++-- .../java/org/apache/doris/catalog/Type.java | 4 +- 
.../apache/doris/catalog/VariantField.java | 28 +++- .../doris/catalog/VariantSkipPattern.java | 82 --------- .../org/apache/doris/catalog/VariantType.java | 70 ++++---- .../java/org/apache/doris/catalog/Column.java | 156 +++++++++++++----- .../org/apache/doris/catalog/OlapTable.java | 28 ++-- .../nereids/parser/LogicalPlanBuilder.java | 25 +-- .../functions/scalar/ElementAt.java | 2 +- .../plans/commands/info/CreateTableInfo.java | 4 +- .../apache/doris/nereids/types/DataType.java | 16 +- .../doris/nereids/types/VariantField.java | 27 ++- .../nereids/types/VariantSkipPattern.java | 137 --------------- .../doris/nereids/types/VariantType.java | 69 ++++---- .../ExternalFileTableValuedFunction.java | 5 +- gensrc/proto/olap_file.proto | 9 +- gensrc/thrift/Descriptors.thrift | 14 +- .../test_schema_template_skip.groovy | 19 +-- 25 files changed, 416 insertions(+), 529 deletions(-) delete mode 100644 fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index ec3f9ca54bb9c5..93ed6ca079bd75 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -85,14 +85,7 @@ bvar::Window> g_contains_agg_with_cache_if_eligible_full_h namespace { inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) { - switch (pattern_type) { - case TPatternType::MATCH_NAME: - return PatternTypePB::MATCH_NAME; - case TPatternType::MATCH_NAME_GLOB: - return PatternTypePB::MATCH_NAME_GLOB; - default: - return PatternTypePB::MATCH_NAME_GLOB; - } + return static_cast(pattern_type); } } // namespace @@ -571,21 +564,6 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco if (tcolumn.__isset.variant_doc_hash_shard_count) { column->set_variant_doc_hash_shard_count(tcolumn.variant_doc_hash_shard_count); } - if (tcolumn.__isset.skip_patterns && 
!tcolumn.skip_patterns.empty()) { - auto* skip_patterns = column->mutable_skip_patterns(); - skip_patterns->Reserve(cast_set(tcolumn.skip_patterns.size())); - for (const auto& tsp : tcolumn.skip_patterns) { - // Skip invalid entries to avoid persisting empty rules. - if (!tsp.__isset.pattern || tsp.pattern.empty()) { - continue; - } - auto* sp = skip_patterns->Add(); - sp->set_pattern(tsp.pattern); - sp->set_pattern_type(tsp.__isset.pattern_type - ? to_pattern_type_pb(tsp.pattern_type) - : PatternTypePB::MATCH_NAME_GLOB); - } - } } void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version) { diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 1aba9db348a252..b6d02ac917b510 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -678,16 +678,7 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { _variant.doc_hash_shard_count = column.variant_doc_hash_shard_count(); } if (column.has_pattern_type()) { - _pattern_type = column.pattern_type(); - } - _variant.skip_patterns.clear(); - _variant.skip_patterns.reserve(column.skip_patterns_size()); - for (const auto& sp : column.skip_patterns()) { - if (!sp.has_pattern() || sp.pattern().empty()) { - continue; - } - PatternTypePB pt = sp.has_pattern_type() ? 
sp.pattern_type() : PatternTypePB::MATCH_NAME_GLOB; - _variant.skip_patterns.emplace_back(sp.pattern(), pt); + _field_pattern_type = column.pattern_type(); } } @@ -764,7 +755,7 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_index_length(0); } column->set_variant_max_subcolumns_count(_variant.max_subcolumns_count); - column->set_pattern_type(_pattern_type); + column->set_pattern_type(_field_pattern_type); column->set_variant_enable_typed_paths_to_sparse(_variant.enable_typed_paths_to_sparse); column->set_variant_max_sparse_column_statistics_size( _variant.max_sparse_column_statistics_size); @@ -772,13 +763,6 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_variant_enable_doc_mode(_variant.enable_doc_mode); column->set_variant_doc_materialization_min_rows(_variant.doc_materialization_min_rows); column->set_variant_doc_hash_shard_count(_variant.doc_hash_shard_count); - auto* skip_patterns = column->mutable_skip_patterns(); - skip_patterns->Reserve(cast_set(_variant.skip_patterns.size())); - for (const auto& [pattern, pt] : _variant.skip_patterns) { - auto* sp = skip_patterns->Add(); - sp->set_pattern(pattern); - sp->set_pattern_type(pt); - } } void TabletColumn::add_sub_column(TabletColumn& sub_column) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index dd75b488135613..56666c6edd4154 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -80,9 +80,6 @@ class TabletColumn : public MetadataAdder { bool enable_doc_mode = false; int64_t doc_materialization_min_rows = 0; int32_t doc_hash_shard_count = 64; - - // skip patterns for variant column - std::vector> skip_patterns; }; TabletColumn(); @@ -237,7 +234,7 @@ class TabletColumn : public MetadataAdder { _variant.max_subcolumns_count = variant_max_subcolumns_count; } - PatternTypePB pattern_type() const { return _pattern_type; } + PatternTypePB pattern_type() const { return _field_pattern_type; } bool 
variant_enable_typed_paths_to_sparse() const { return _variant.enable_typed_paths_to_sparse; @@ -323,7 +320,8 @@ class TabletColumn : public MetadataAdder { // The extracted sub-columns from "variant" contain the following information: int32_t _parent_col_unique_id = -1; // "variant" -> col_unique_id vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves - PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB; + // When pattern_type is absent (legacy metadata), keep typed-path default behavior. + PatternTypePB _field_pattern_type = PatternTypePB::MATCH_NAME_GLOB; VariantParams _variant; }; diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 137bd5d723ce58..47dff23714bb96 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -283,22 +283,25 @@ bool glob_match_re2(const std::string& glob_pattern, const std::string& candidat } Status build_compiled_skip_matcher( - const std::vector>& skip_patterns, + const std::vector>& skip_path_patterns, bool enable_re2_set, std::shared_ptr* out) { if (out == nullptr) { return Status::InvalidArgument("Output pointer for compiled skip matcher is null"); } auto matcher = std::make_shared(); - matcher->exact_patterns.reserve(skip_patterns.size()); + matcher->exact_patterns.reserve(skip_path_patterns.size()); std::vector glob_regex_patterns; - glob_regex_patterns.reserve(skip_patterns.size()); - for (const auto& [pattern, pt] : skip_patterns) { - if (pt == PatternTypePB::MATCH_NAME) { + glob_regex_patterns.reserve(skip_path_patterns.size()); + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_exact_path_pattern_type(pt)) { matcher->exact_patterns.insert(pattern); continue; } + if (!is_skip_glob_path_pattern_type(pt)) { + continue; + } std::string regex_pattern; RETURN_IF_ERROR(glob_to_regex(pattern, ®ex_pattern)); @@ -360,6 +363,36 @@ bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view 
path) return false; } +namespace { + +inline bool is_variant_skip_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME || + pattern_type == PatternTypePB::SKIP_NAME_GLOB; +} + +void collect_variant_skip_path_patterns_from_children( + const TabletColumn& column, + std::vector>* skip_path_patterns) { + skip_path_patterns->clear(); + for (const auto& sub_column : column.get_sub_columns()) { + if (!is_variant_skip_path_pattern_type(sub_column->pattern_type())) { + continue; + } + skip_path_patterns->emplace_back(sub_column->name(), sub_column->pattern_type()); + } +} + +bool has_variant_typed_path_children(const TabletColumn& column) { + for (const auto& sub_column : column.get_sub_columns()) { + if (is_typed_path_pattern_type(sub_column->pattern_type())) { + return true; + } + } + return false; +} + +} // namespace + size_t get_number_of_dimensions(const IDataType& type) { if (const auto* type_array = typeid_cast(&type)) { return type_array->get_number_of_dimensions(); @@ -565,10 +598,11 @@ Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) { return Status::OK(); } -Status update_least_schema_internal(const std::map& subcolumns_types, - TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - const std::map& typed_columns, - std::set* path_set) { +Status update_least_schema_internal( + const std::map& subcolumns_types, TabletSchemaSPtr& common_schema, + int32_t variant_col_unique_id, + const std::map& typed_path_columns, + std::set* path_set) { PathsInData tuple_paths; DataTypes tuple_types; CHECK(common_schema.use_count() == 1); @@ -604,10 +638,10 @@ Status update_least_schema_internal(const std::map& subco // Append all common type columns of this variant for (int i = 0; i < tuple_paths.size(); ++i) { TabletColumn common_column; - // typed path not contains root part + // typed path does not include root part auto path_without_root = tuple_paths[i].copy_pop_front().get_path(); - if 
(typed_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) { - common_column = *typed_columns.at(path_without_root); + if (typed_path_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) { + common_column = *typed_path_columns.at(path_without_root); // parent unique id and path may not be init in write path common_column.set_parent_unique_id(variant_col_unique_id); common_column.set_path_info(tuple_paths[i]); @@ -630,10 +664,13 @@ Status update_least_schema_internal(const std::map& subco Status update_least_common_schema(const std::vector& schemas, TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, std::set* path_set) { - std::map typed_columns; + std::map typed_path_columns; for (const TabletColumnPtr& col : common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) { - typed_columns[col->name()] = col; + if (!is_typed_path_pattern_type(col->pattern_type())) { + continue; + } + typed_path_columns[col->name()] = col; } // Types of subcolumns by path from all tuples. std::map subcolumns_types; @@ -657,7 +694,7 @@ Status update_least_common_schema(const std::vector& schemas, RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths)); return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id, - typed_columns, path_set); + typed_path_columns, path_set); } // Keep variant subcolumn BF support aligned with FE DDL checks. @@ -1317,7 +1354,8 @@ Status VariantCompactionUtil::get_extended_compaction_schema( uid_to_paths_set_info[column->unique_id()]); // 4. 
append subcolumns - if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) { + if (column->variant_max_subcolumns_count() > 0 || + has_variant_typed_path_children(*column)) { get_compaction_subcolumns_from_subpaths( uid_to_paths_set_info[column->unique_id()], column, target, uid_to_variant_extended_info[column->unique_id()].path_to_data_types, @@ -2140,6 +2178,8 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t } std::vector configs(variant_column_pos.size()); + std::vector>> variant_skip_path_patterns( + variant_column_pos.size()); for (size_t i = 0; i < variant_column_pos.size(); ++i) { configs[i].enable_flatten_nested = tablet_schema.variant_flatten_nested(); const auto& column = tablet_schema.column(variant_column_pos[i]); @@ -2147,11 +2187,11 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::InternalError("column is not variant type, column name: {}", column.name()); } - // set skip patterns if any - if (!column.variant_params().skip_patterns.empty()) { - configs[i].skip_patterns = &column.variant_params().skip_patterns; - RETURN_IF_ERROR(build_compiled_skip_matcher(column.variant_params().skip_patterns, - true, + // Set skip path patterns if configured on variant children. 
+ collect_variant_skip_path_patterns_from_children(column, &variant_skip_path_patterns[i]); + if (!variant_skip_path_patterns[i].empty()) { + configs[i].skip_path_patterns = &variant_skip_path_patterns[i]; + RETURN_IF_ERROR(build_compiled_skip_matcher(variant_skip_path_patterns[i], true, &configs[i].compiled_skip_matcher)); } // if doc mode is not enabled, no need to parse to doc value column diff --git a/be/src/vec/common/variant_util.h b/be/src/vec/common/variant_util.h index 42cf49f2d14e92..56a9741802f703 100644 --- a/be/src/vec/common/variant_util.h +++ b/be/src/vec/common/variant_util.h @@ -68,32 +68,45 @@ namespace doris::vectorized::variant_util { struct CompiledSkipMatcher; +inline bool is_typed_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::MATCH_NAME || + pattern_type == PatternTypePB::MATCH_NAME_GLOB; +} + +inline bool is_skip_exact_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME; +} + +inline bool is_skip_glob_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME_GLOB; +} + // Convert a restricted glob pattern into a regex (for tests/internal use). Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); // Match a glob pattern against a path using RE2. bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); -// Build an immutable matcher for skip patterns used in hot parsing paths. +// Build an immutable matcher for skip path patterns used in hot parsing paths. Status build_compiled_skip_matcher( - const std::vector>& skip_patterns, + const std::vector>& skip_path_patterns, bool enable_re2_set, std::shared_ptr* out); -// Match a dot-separated path against precompiled skip patterns. +// Match a dot-separated path against precompiled skip path patterns. 
bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path); -// Check if a dot-separated path should be skipped based on skip patterns. -// For MATCH_NAME_GLOB, uses glob matching; for MATCH_NAME, uses exact string comparison. +// Check if a dot-separated path should be skipped based on skip path patterns. +// For SKIP_NAME_GLOB, uses glob matching; for SKIP_NAME, uses exact string comparison. inline bool should_skip_path( - const std::vector>& skip_patterns, + const std::vector>& skip_path_patterns, const std::string& path) { - for (const auto& [pattern, pt] : skip_patterns) { - if (pt == PatternTypePB::MATCH_NAME && path == pattern) { + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_exact_path_pattern_type(pt) && path == pattern) { return true; } } - for (const auto& [pattern, pt] : skip_patterns) { - if (pt != PatternTypePB::MATCH_NAME && glob_match_re2(pattern, path)) { + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_glob_path_pattern_type(pt) && glob_match_re2(pattern, path)) { return true; } } @@ -195,10 +208,11 @@ bool inherit_index(const std::vector& parent_indexes, bool inherit_index(const std::vector& parent_indexes, TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb); -Status update_least_schema_internal(const std::map& subcolumns_types, - TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - const std::map& typed_columns, - std::set* path_set = nullptr); +Status update_least_schema_internal( + const std::map& subcolumns_types, TabletSchemaSPtr& common_schema, + int32_t variant_col_unique_id, + const std::map& typed_path_columns, + std::set* path_set = nullptr); bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index d36fef6fd0298f..150bb7f632f91c 100644 --- a/be/src/vec/json/json_parser.cpp +++ 
b/be/src/vec/json/json_parser.cpp @@ -47,12 +47,12 @@ std::optional JSONDataParser::parse(const char* begin, ParseContext context; context.enable_flatten_nested = config.enable_flatten_nested; context.is_top_array = document.isArray(); - context.skip_patterns = config.skip_patterns; + context.skip_path_patterns = config.skip_path_patterns; context.skip_matcher = config.compiled_skip_matcher.get(); context.skip_result_cache_capacity = config.skip_result_cache_capacity; if (context.skip_result_cache_capacity > 0 && (context.skip_matcher != nullptr || - (context.skip_patterns != nullptr && !context.skip_patterns->empty()))) { + (context.skip_path_patterns != nullptr && !context.skip_path_patterns->empty()))) { context.skip_cache.reserve(context.skip_result_cache_capacity); } traverse(document, context); @@ -109,11 +109,11 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC fmt::format("Key length exceeds maximum allowed size of {} bytes.", max_key_length)); } - const bool has_skip_patterns = ctx.skip_matcher != nullptr || - (ctx.skip_patterns != nullptr && - !ctx.skip_patterns->empty()); - // Check skip patterns: build the dot-separated path and test against patterns - if (has_skip_patterns) { + const bool has_skip_path_patterns = + ctx.skip_matcher != nullptr || + (ctx.skip_path_patterns != nullptr && !ctx.skip_path_patterns->empty()); + // Check skip path patterns: build the dot-separated path and test against patterns. + if (has_skip_path_patterns) { const size_t old_length = ctx.current_path.size(); const size_t required_capacity = old_length + (old_length ? 
1 : 0) + key.size(); if (ctx.current_path.capacity() < required_capacity) { @@ -133,9 +133,11 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC cache_it->second.lru_it); } else { if (ctx.skip_matcher != nullptr) { - is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); + is_skipped = + variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); } else { - is_skipped = variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path); + is_skipped = variant_util::should_skip_path(*ctx.skip_path_patterns, + ctx.current_path); } if (ctx.skip_cache.size() >= ctx.skip_result_cache_capacity && @@ -155,7 +157,8 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC } else if (ctx.skip_matcher != nullptr) { is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); } else { - is_skipped = variant_util::should_skip_path(*ctx.skip_patterns, ctx.current_path); + is_skipped = + variant_util::should_skip_path(*ctx.skip_path_patterns, ctx.current_path); } if (is_skipped) { @@ -274,7 +277,7 @@ void JSONDataParser::traverseArrayElement(const Element& element, ParseContext element_ctx; element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; - element_ctx.skip_patterns = nullptr; + element_ctx.skip_path_patterns = nullptr; element_ctx.skip_matcher = nullptr; element_ctx.skip_result_cache_capacity = 0; traverse(element, element_ctx); diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 962bb2edcd5ec1..890288b451f13a 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -113,8 +113,8 @@ struct ParseConfig { BothSubcolumnsAndDocValueColumn = 2, }; ParseTo parse_to = ParseTo::OnlySubcolumns; - // skip patterns for variant column (pointer to avoid copy; nullptr means no skip) - const std::vector>* skip_patterns = nullptr; + // skip path patterns for variant column (pointer to avoid 
copy; nullptr means no skip) + const std::vector>* skip_path_patterns = nullptr; // pre-compiled skip matcher for hot parsing path std::shared_ptr compiled_skip_matcher = nullptr; // per-parse cache size for "path -> skip result", 0 means disabled @@ -148,9 +148,9 @@ class JSONDataParser { bool enable_flatten_nested = false; bool has_nested_in_flatten = false; bool is_top_array = false; - // skip patterns pointer (nullptr means no skip) - const std::vector>* skip_patterns = nullptr; - // pre-compiled skip matcher (nullptr means use skip_patterns fallback) + // skip path patterns pointer (nullptr means no skip) + const std::vector>* skip_path_patterns = nullptr; + // pre-compiled skip matcher (nullptr means use skip_path_patterns fallback) const variant_util::CompiledSkipMatcher* skip_matcher = nullptr; // max entries for skip result cache in one parse invocation uint16_t skip_result_cache_capacity = 0; diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index 4a066297147a2b..1f0056d4b2f839 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -172,13 +172,13 @@ static std::vector> _build_skip_patterns_f // Exact match patterns. for (size_t leaf_id = 0; leaf_id < 10000; leaf_id += 211) { - patterns.emplace_back(_path_of_leaf_id(leaf_id), PatternTypePB::MATCH_NAME); + patterns.emplace_back(_path_of_leaf_id(leaf_id), PatternTypePB::SKIP_NAME); } // Unmatched glob patterns to amplify old per-pattern matching cost. for (int i = 0; i < 30; ++i) { patterns.emplace_back("x" + std::to_string(i) + "*.s?.t?.k?", - PatternTypePB::MATCH_NAME_GLOB); + PatternTypePB::SKIP_NAME_GLOB); } // Matched glob patterns. 
@@ -186,7 +186,7 @@ static std::vector> _build_skip_patterns_f std::string pattern = "g"; pattern.push_back(static_cast('0' + g)); pattern += ".s?.t?.k[02468]"; - patterns.emplace_back(std::move(pattern), PatternTypePB::MATCH_NAME_GLOB); + patterns.emplace_back(std::move(pattern), PatternTypePB::SKIP_NAME_GLOB); } return patterns; @@ -205,8 +205,7 @@ static PerfParseResult _run_parse_perf(const vectorized::ColumnString& json_colu const auto end = std::chrono::steady_clock::now(); PerfParseResult result; result.column = std::move(variant); - result.elapsed_ms = - std::chrono::duration_cast(end - start).count(); + result.elapsed_ms = std::chrono::duration_cast(end - start).count(); return result; } @@ -527,7 +526,7 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { vectorized::ParseConfig legacy_config; legacy_config.enable_flatten_nested = false; legacy_config.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; - legacy_config.skip_patterns = &skip_patterns; + legacy_config.skip_path_patterns = &skip_patterns; legacy_config.compiled_skip_matcher = nullptr; legacy_config.skip_result_cache_capacity = 0; @@ -564,22 +563,22 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { ASSERT_TRUE(found_no_skip_difference) << "no-skip output should differ from skip-enabled output on sampled rows"; - const double legacy_vs_no_skip = no_skip_result.elapsed_ms > 0 - ? static_cast(legacy_result.elapsed_ms) / - static_cast(no_skip_result.elapsed_ms) - : 0.0; - const double optimized_vs_no_skip = no_skip_result.elapsed_ms > 0 - ? static_cast(optimized_result.elapsed_ms) / - static_cast(no_skip_result.elapsed_ms) - : 0.0; - const double optimized_vs_legacy = optimized_result.elapsed_ms > 0 - ? static_cast(legacy_result.elapsed_ms) / - static_cast(optimized_result.elapsed_ms) - : 0.0; + const double legacy_vs_no_skip = + no_skip_result.elapsed_ms > 0 ? 
static_cast(legacy_result.elapsed_ms) / + static_cast(no_skip_result.elapsed_ms) + : 0.0; + const double optimized_vs_no_skip = + no_skip_result.elapsed_ms > 0 ? static_cast(optimized_result.elapsed_ms) / + static_cast(no_skip_result.elapsed_ms) + : 0.0; + const double optimized_vs_legacy = + optimized_result.elapsed_ms > 0 + ? static_cast(legacy_result.elapsed_ms) / + static_cast(optimized_result.elapsed_ms) + : 0.0; LOG(INFO) << "skip-pattern perf compare (1000 rows, 10k nested columns, same random data): " - << "no_skip_ms=" << no_skip_result.elapsed_ms - << ", " + << "no_skip_ms=" << no_skip_result.elapsed_ms << ", " << "legacy_ms=" << legacy_result.elapsed_ms << ", optimized_ms=" << optimized_result.elapsed_ms << ", legacy_vs_no_skip=" << legacy_vs_no_skip diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java index 3cd318aa6c5fe6..345f48cbcc70e7 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java @@ -1158,8 +1158,8 @@ public static boolean matchExactType(Type type1, Type type2, boolean ignorePreci } return true; } else if (type1.isVariantType()) { - ArrayList fields1 = ((VariantType) type1).getPredefinedFields(); - ArrayList fields2 = ((VariantType) type2).getPredefinedFields(); + ArrayList fields1 = ((VariantType) type1).getVariantTypedPathPatterns(); + ArrayList fields2 = ((VariantType) type2).getVariantTypedPathPatterns(); if (fields1.size() != fields2.size()) { return false; } diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java index adb99c52cf009e..c5055026e17361 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java @@ -66,15 +66,34 @@ public TPatternType getPatternType() { 
return patternType; } + public boolean isSkipPatternType() { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public boolean isTypedPathPatternType() { + return patternType == null + || patternType == TPatternType.MATCH_NAME + || patternType == TPatternType.MATCH_NAME_GLOB; + } + public String toSql(int depth) { StringBuilder sb = new StringBuilder(); + if (isSkipPatternType()) { + sb.append("SKIP "); + if (patternType == TPatternType.SKIP_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } + if (patternType == TPatternType.MATCH_NAME) { sb.append(patternType.toString()).append(" "); } sb.append("'").append(pattern).append("'"); sb.append(":").append(type.toSql(depth + 1)); - if (!comment.isEmpty()) { + if (comment != null && !comment.isEmpty()) { sb.append(" COMMENT '").append(comment).append("'"); } return sb.toString(); @@ -98,6 +117,9 @@ public String prettyPrint(int lpad) { } public boolean matchesField(VariantField f) { + if (!isTypedPathPatternType() || !f.isTypedPathPatternType()) { + return false; + } if (equals(f)) { return true; } @@ -114,7 +136,9 @@ public boolean equals(Object other) { return false; } VariantField otherFiled = (VariantField) other; - return otherFiled.pattern.equals(pattern) && otherFiled.type.equals(type); + return otherFiled.pattern.equals(pattern) + && otherFiled.type.equals(type) + && otherFiled.patternType == patternType; } @Override diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java deleted file mode 100644 index bf879067460965..00000000000000 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantSkipPattern.java +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.catalog; - -import org.apache.doris.thrift.TPatternType; - -import com.google.gson.annotations.SerializedName; - -import java.util.Objects; - -/** - * Catalog-layer representation of a variant SKIP pattern. - * Used for Gson persistence in FE metadata. - */ -public class VariantSkipPattern { - - @SerializedName(value = "p") - private final String pattern; - - @SerializedName(value = "pt") - private final TPatternType patternType; - - public VariantSkipPattern(String pattern, TPatternType patternType) { - this.pattern = Objects.requireNonNull(pattern, "pattern should not be null"); - this.patternType = Objects.requireNonNull(patternType, "patternType should not be null"); - } - - public String getPattern() { - return pattern; - } - - public TPatternType getPatternType() { - return patternType; - } - - public String toSql() { - StringBuilder sb = new StringBuilder(); - sb.append("SKIP "); - if (patternType == TPatternType.MATCH_NAME) { - sb.append("MATCH_NAME "); - } - sb.append("'").append(pattern).append("'"); - return sb.toString(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - VariantSkipPattern that = (VariantSkipPattern) o; - 
return Objects.equals(pattern, that.pattern) && patternType == that.patternType; - } - - @Override - public int hashCode() { - return Objects.hash(pattern, patternType); - } - - @Override - public String toString() { - return toSql(); - } -} diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index 33c940075d0ffd..7c1fca95f822d7 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -38,10 +38,7 @@ public class VariantType extends ScalarType { private final HashMap fieldMap = Maps.newHashMap(); @SerializedName(value = "fields") - private final ArrayList predefinedFields; - - @SerializedName(value = "skipPatterns") - private final ArrayList skipPatterns; + private final ArrayList variantPathPatterns; @SerializedName(value = "variantMaxSubcolumnsCount") private final int variantMaxSubcolumnsCount; @@ -68,8 +65,7 @@ public class VariantType extends ScalarType { public VariantType() { super(PrimitiveType.VARIANT); - this.predefinedFields = Lists.newArrayList(); - this.skipPatterns = Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -82,11 +78,8 @@ public VariantType() { public VariantType(ArrayList fields) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - this.skipPatterns = Lists.newArrayList(); - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + this.variantPathPatterns = fields; + addTypedPathPatternsToFieldMap(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -98,8 +91,7 @@ public VariantType(ArrayList 
fields) { public VariantType(Map properties) { super(PrimitiveType.VARIANT); - this.predefinedFields = Lists.newArrayList(); - this.skipPatterns = Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.properties = properties; this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; @@ -113,11 +105,8 @@ public VariantType(Map properties) { public VariantType(ArrayList fields, Map properties) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - this.skipPatterns = Lists.newArrayList(); - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + this.variantPathPatterns = fields; + addTypedPathPatternsToFieldMap(); this.properties = properties; this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; @@ -128,7 +117,7 @@ public VariantType(ArrayList fields, Map propertie this.variantDocShardCount = 64; } - public VariantType(ArrayList fields, ArrayList skipPatterns, + public VariantType(ArrayList variantPathPatterns, int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, @@ -137,12 +126,9 @@ public VariantType(ArrayList fields, ArrayList long variantDocMaterializationMinRows, int variantDocShardCount) { super(PrimitiveType.VARIANT); - Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - this.skipPatterns = skipPatterns != null ? 
skipPatterns : Lists.newArrayList(); - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + Preconditions.checkNotNull(variantPathPatterns); + this.variantPathPatterns = variantPathPatterns; + addTypedPathPatternsToFieldMap(); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; @@ -152,18 +138,21 @@ public VariantType(ArrayList fields, ArrayList this.variantDocShardCount = variantDocShardCount; } + private void addTypedPathPatternsToFieldMap() { + for (VariantField pathPattern : variantPathPatterns) { + if (pathPattern.isTypedPathPatternType()) { + fieldMap.put(pathPattern.getPattern(), pathPattern); + } + } + } + @Override public String toSql(int depth) { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); - if (!skipPatterns.isEmpty()) { - sb.append(skipPatterns.stream() - .map(VariantSkipPattern::toSql).collect(Collectors.joining(","))); - sb.append(","); - } - if (!predefinedFields.isEmpty()) { - sb.append(predefinedFields.stream() + if (!variantPathPatterns.isEmpty()) { + sb.append(variantPathPatterns.stream() .map(variantField -> variantField.toSql(depth)).collect(Collectors.joining(","))); sb.append(","); } @@ -195,12 +184,18 @@ public String toSql(int depth) { return sb.toString(); } - public ArrayList getPredefinedFields() { - return predefinedFields; + public ArrayList getVariantPathPatterns() { + return variantPathPatterns; } - public ArrayList getSkipPatterns() { - return skipPatterns; + public ArrayList getVariantTypedPathPatterns() { + ArrayList typedPathPatterns = Lists.newArrayList(); + for (VariantField variantPathPattern : variantPathPatterns) { + if (variantPathPattern.isTypedPathPatternType()) { + typedPathPatterns.add(variantPathPattern); + } + } + return typedPathPatterns; } @Override @@ -232,8 
+227,7 @@ public boolean equals(Object other) { return false; } VariantType otherVariantType = (VariantType) other; - return Objects.equals(otherVariantType.getPredefinedFields(), predefinedFields) - && Objects.equals(otherVariantType.getSkipPatterns(), skipPatterns) + return Objects.equals(otherVariantType.getVariantPathPatterns(), variantPathPatterns) && variantMaxSubcolumnsCount == otherVariantType.variantMaxSubcolumnsCount && enableTypedPathsToSparse == otherVariantType.enableTypedPathsToSparse && enableVariantDocMode == otherVariantType.enableVariantDocMode diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java index 20ab041a296706..0cf3e63edecda4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java @@ -33,7 +33,6 @@ import org.apache.doris.thrift.TColumnType; import org.apache.doris.thrift.TPatternType; import org.apache.doris.thrift.TPrimitiveType; -import org.apache.doris.thrift.TSkipPattern; import com.google.common.base.Strings; import com.google.common.collect.Lists; @@ -109,6 +108,10 @@ public class Column implements GsonPostProcessable { @SerializedName(value = "comment") private String comment; @SerializedName(value = "children") + // Generic sub-columns for complex types. + // For VARIANT, this list stores both typed-path templates and skip rules. + // Caller should filter by fieldPatternType: + // MATCH_* -> typed path, SKIP_* -> skip pattern. private List children; /** * This is similar as `defaultValue`. 
Differences are: @@ -356,9 +359,10 @@ public void createChildrenColumn(Type type, Column column) { column.addChildrenColumn(c); } } else if (type.isVariantType() && type instanceof VariantType) { - // variant may contain predefined structured fields - ArrayList fields = ((VariantType) type).getPredefinedFields(); - for (VariantField field : fields) { + // Variant stores typed-path templates and skip patterns as sibling children, + // distinguished by fieldPatternType. + ArrayList variantPathPatterns = ((VariantType) type).getVariantPathPatterns(); + for (VariantField field : variantPathPatterns) { // set column name as pattern Column c = new Column(field.pattern, field.getType()); c.setIsAllowNull(true); @@ -372,6 +376,22 @@ public List getChildren() { return children; } + public List getVariantTypedPathChildrenOrEmpty() { + if (!(type instanceof VariantType)) { + return Lists.newArrayList(); + } + if (CollectionUtils.isEmpty(children)) { + return Lists.newArrayList(); + } + List typedPathChildren = Lists.newArrayListWithCapacity(children.size()); + for (Column child : children) { + if (isVariantTypedPathPatternType(child.fieldPatternType)) { + typedPathChildren.add(child); + } + } + return typedPathChildren; + } + private void addChildrenColumn(Column column) { if (this.children == null) { this.children = Lists.newArrayListWithExpectedSize(2); @@ -660,12 +680,6 @@ public TColumn toThrift() { tColumn.setVariantEnableDocMode(this.getVariantEnableDocMode()); tColumn.setVariantDocMaterializationMinRows(this.getvariantDocMaterializationMinRows()); tColumn.setVariantDocHashShardCount(this.getVariantDocShardCount()); - tColumn.setSkipPatterns(this.getVariantSkipPatterns().stream().map(sp -> { - TSkipPattern tsp = new TSkipPattern(); - tsp.setPattern(sp.getPattern()); - tsp.setPatternType(sp.getPatternType()); - return tsp; - }).collect(java.util.stream.Collectors.toList())); // ATTN: // Currently, this `toThrift()` method is only used from CreateReplicaTask. 
// And CreateReplicaTask does not need `defineExpr` field. @@ -704,23 +718,86 @@ private void setChildrenTColumn(Column children, TColumn tColumn) { toChildrenThrift(children, childrenTColumn); } - private void addChildren(Column column, TColumn tColumn) { - if (column.getChildren() != null) { - List childrenColumns = column.getChildren(); - tColumn.setChildrenColumn(new ArrayList<>()); - for (Column c : childrenColumns) { - setChildrenTColumn(c, tColumn); - } + private void appendVariantTypedPathChildren(Column column, TColumn tColumn) { + List typedPathChildren = column.getVariantTypedPathChildrenOrEmpty(); + if (typedPathChildren.isEmpty()) { + return; + } + ensureChildrenColumnInitialized(tColumn); + for (Column typedPathChild : typedPathChildren) { + setChildrenTColumn(typedPathChild, tColumn); } } - private void addChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { - if (this.getChildren() != null) { - List childrenColumns = this.getChildren(); - for (Column c : childrenColumns) { - builder.addChildrenColumns(c.toPb(Sets.newHashSet(), Lists.newArrayList())); + private void appendVariantTypedPathChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { + List typedPathChildren = getVariantTypedPathChildrenOrEmpty(); + if (typedPathChildren.isEmpty()) { + return; + } + for (Column typedPathChild : typedPathChildren) { + builder.addChildrenColumns(typedPathChild.toPb(Sets.newHashSet(), Lists.newArrayList())); + } + } + + private static PatternTypePB toPatternTypeForColumnPb(TPatternType patternType) { + if (patternType == null) { + return PatternTypePB.MATCH_NAME_GLOB; + } + PatternTypePB patternTypePb = PatternTypePB.forNumber(patternType.getValue()); + if (patternTypePb == null) { + throw new IllegalArgumentException("Unknown pattern type: " + patternType); + } + return patternTypePb; + } + + private static boolean isVariantTypedPathPatternType(TPatternType patternType) { + return patternType == null + || patternType == 
TPatternType.MATCH_NAME + || patternType == TPatternType.MATCH_NAME_GLOB; + } + + private static boolean isVariantSkipPatternType(TPatternType patternType) { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public List getVariantSkipPatternChildrenOrEmpty() { + if (!(type instanceof VariantType) || CollectionUtils.isEmpty(children)) { + return Lists.newArrayList(); + } + List skipPatternChildren = Lists.newArrayList(); + for (Column child : children) { + if (isVariantSkipPatternType(child.fieldPatternType)) { + skipPatternChildren.add(child); } } + return skipPatternChildren; + } + + private void appendVariantSkipPatternChildren(Column column, TColumn tColumn) { + List skipPatternChildren = column.getVariantSkipPatternChildrenOrEmpty(); + if (skipPatternChildren.isEmpty()) { + return; + } + ensureChildrenColumnInitialized(tColumn); + for (Column skipPatternChild : skipPatternChildren) { + setChildrenTColumn(skipPatternChild, tColumn); + } + } + + private void appendVariantSkipPatternChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { + List skipPatternChildren = getVariantSkipPatternChildrenOrEmpty(); + if (skipPatternChildren.isEmpty()) { + return; + } + for (Column skipPatternChild : skipPatternChildren) { + builder.addChildrenColumns(skipPatternChild.toPb(Sets.newHashSet(), Lists.newArrayList())); + } + } + + private static void ensureChildrenColumnInitialized(TColumn tColumn) { + if (tColumn.children_column == null) { + tColumn.setChildrenColumn(new ArrayList<>()); + } } private void toChildrenThrift(Column column, TColumn tColumn) { @@ -741,8 +818,10 @@ private void toChildrenThrift(Column column, TColumn tColumn) { setChildrenTColumn(children, tColumn); } } else if (column.type.isVariantType()) { - // variant may contain predefined structured fields - addChildren(column, tColumn); + // Variant children are persisted as two peer groups: + // 1) typed path schema templates, 2) skip pattern 
rules. + appendVariantTypedPathChildren(column, tColumn); + appendVariantSkipPatternChildren(column, tColumn); } } @@ -826,11 +905,7 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws builder.setType(this.getDataType().toThrift().name()); builder.setIsKey(this.isKey); if (fieldPatternType != null) { - if (fieldPatternType == TPatternType.MATCH_NAME) { - builder.setPatternType(PatternTypePB.MATCH_NAME); - } else { - builder.setPatternType(PatternTypePB.MATCH_NAME_GLOB); - } + builder.setPatternType(toPatternTypeForColumnPb(fieldPatternType)); } if (null != this.aggregationType) { if (type.isAggStateType()) { @@ -893,15 +968,9 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws builder.setVariantEnableDocMode(this.getVariantEnableDocMode()); builder.setVariantDocMaterializationMinRows(this.getvariantDocMaterializationMinRows()); builder.setVariantDocHashShardCount(this.getVariantDocShardCount()); - builder.addAllSkipPatterns(this.getVariantSkipPatterns().stream().map(sp -> - OlapFile.SkipPatternPB.newBuilder() - .setPattern(sp.getPattern()) - .setPatternType(sp.getPatternType() == TPatternType.MATCH_NAME - ? PatternTypePB.MATCH_NAME : PatternTypePB.MATCH_NAME_GLOB) - .build() - ).collect(java.util.stream.Collectors.toList())); - // variant may contain predefined structured fields - addChildren(builder); + // Keep typed paths and skip rules as sibling children entries. 
+ appendVariantTypedPathChildren(builder); + appendVariantSkipPatternChildren(builder); } OlapFile.ColumnPB col = builder.build(); @@ -991,9 +1060,14 @@ public void checkSchemaChangeAllowed(Column other) throws DdlException { if (this.getVariantDocShardCount() != other.getVariantDocShardCount()) { throw new DdlException("Can not change variant doc snapshot shard count"); } - if (CollectionUtils.isNotEmpty(this.getChildren()) || CollectionUtils.isNotEmpty(other.getChildren())) { + if (CollectionUtils.isNotEmpty(this.getVariantTypedPathChildrenOrEmpty()) + || CollectionUtils.isNotEmpty(other.getVariantTypedPathChildrenOrEmpty())) { throw new DdlException("Can not change variant schema templates"); } + if (CollectionUtils.isNotEmpty(this.getVariantSkipPatternChildrenOrEmpty()) + || CollectionUtils.isNotEmpty(other.getVariantSkipPatternChildrenOrEmpty())) { + throw new DdlException("Can not change variant skip patterns"); + } } } @@ -1350,10 +1424,6 @@ public int getVariantDocShardCount() { return type.isVariantType() ? ((ScalarType) type).getVariantDocShardCount() : 128; } - public ArrayList getVariantSkipPatterns() { - return type.isVariantType() ? 
((VariantType) type).getSkipPatterns() : Lists.newArrayList(); - } - public void setFieldPatternType(TPatternType type) { fieldPatternType = type; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 07c550a639721d..e1c9b96ed13a28 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -3743,25 +3743,23 @@ public Index getInvertedIndex(Column column, List subPath, String analyz : filteredInvertedIndexes.stream().filter(Index::isAnalyzedInvertedIndex).findFirst().orElse(null); } - // subPath is not empty, means it is a variant column, find the field pattern from children + // subPath is not empty, means it is a variant column, find the field pattern from typed-path templates String subPathString = String.join(".", subPath); String fieldPattern = ""; - if (column.getChildren() != null) { - for (Column child : column.getChildren()) { - String childName = child.getName(); - if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) { - try { - com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName); - if (compiled.matcher(subPathString).matches()) { - fieldPattern = childName; - } - } catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) { - continue; - } - } else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) { - if (childName.equals(subPathString)) { + for (Column child : column.getVariantTypedPathChildrenOrEmpty()) { + String childName = child.getName(); + if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) { + try { + com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName); + if (compiled.matcher(subPathString).matches()) { fieldPattern = childName; } + } catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) { + continue; + } + } else if 
(child.getFieldPatternType() == TPatternType.MATCH_NAME) { + if (childName.equals(subPathString)) { + fieldPattern = childName; } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 2fa6e5feddc9cd..5982889874e6c9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -1060,11 +1060,11 @@ import org.apache.doris.nereids.types.DateV2Type; import org.apache.doris.nereids.types.LargeIntType; import org.apache.doris.nereids.types.MapType; +import org.apache.doris.nereids.types.StringType; import org.apache.doris.nereids.types.StructField; import org.apache.doris.nereids.types.StructType; import org.apache.doris.nereids.types.VarcharType; import org.apache.doris.nereids.types.VariantField; -import org.apache.doris.nereids.types.VariantSkipPattern; import org.apache.doris.nereids.types.VariantType; import org.apache.doris.nereids.types.coercion.CharacterType; import org.apache.doris.nereids.util.ExpressionUtils; @@ -5100,20 +5100,20 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) "Unsupported variant definition: " + variantDef.getText()); VariantContext variantCtx = (VariantContext) variantDef; - List fields = Lists.newArrayList(); - List skipPatterns = Lists.newArrayList(); + List variantPathPatterns = Lists.newArrayList(); if (variantCtx.variantSubColTypeList() != null) { for (VariantSubColTypeContext subCtx : variantCtx.variantSubColTypeList().variantSubColType()) { if (subCtx.SKIP_() != null) { String skipPattern = subCtx.STRING_LITERAL().getText(); skipPattern = skipPattern.substring(1, skipPattern.length() - 1); - String matchType = subCtx.variantSubColMatchType() != null - ? subCtx.variantSubColMatchType().getText() : null; - skipPatterns.add(matchType != null - ? 
new VariantSkipPattern(skipPattern, matchType) - : new VariantSkipPattern(skipPattern)); + String skipMatchType = subCtx.variantSubColMatchType() == null + ? null + : subCtx.variantSubColMatchType().getText(); + String skipPatternType = "MATCH_NAME".equalsIgnoreCase(skipMatchType) + ? "SKIP_NAME" : "SKIP_NAME_GLOB"; + variantPathPatterns.add(new VariantField(skipPattern, StringType.INSTANCE, "", skipPatternType)); } else { - fields.add(visitVariantSubColType(subCtx)); + variantPathPatterns.add(visitVariantSubColType(subCtx)); } } } @@ -5168,7 +5168,10 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) variantSparseHashShardCount = 0; // Validate that all typed fields use data types supported in doc mode // document mode only supports string, integral, float, and boolean types - for (VariantField field : fields) { + for (VariantField field : variantPathPatterns) { + if (field.isSkipPatternType()) { + continue; + } DataType dataType = field.getDataType(); if (dataType.isArrayType()) { ArrayType arrayType = (ArrayType) dataType; @@ -5199,7 +5202,7 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_DOC_HASH_SHARD_COUNT); } - return new VariantType(fields, skipPatterns, variantMaxSubcolumnsCount, enableTypedPathsToSparse, + return new VariantType(variantPathPatterns, variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocHashShardCount); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java index dd715c3c54d065..26563c33d9466f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java @@ -104,7 +104,7 @@ public FunctionSignature computeSignature(FunctionSignature signature) { DataType expressionType = arguments.get(0).getDataType(); DataType sigType = signature.argumentsTypes.get(0); if (expressionType instanceof VariantType && sigType instanceof VariantType) { - // Preserve predefinedFields for schema template matching + // Preserve variant typed path patterns for schema template matching. VariantType originalType = (VariantType) expressionType; signature = signature.withArgumentType(0, originalType); signature = signature.withReturnType(originalType); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java index 0881884105f4a8..fff54f8244c0e0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java @@ -1407,8 +1407,8 @@ private void columnToIndexesCheck() { } boolean findFieldPattern = false; VariantType variantType = (VariantType) column.getType(); - List predefinedFields = variantType.getPredefinedFields(); - for (VariantField field : predefinedFields) { + List typedPathPatterns = variantType.getVariantTypedPathPatterns(); + for (VariantField field : typedPathPatterns) { if (field.getPattern().equals(fieldPattern)) { findFieldPattern = true; if (!IndexDefinition.isSupportIdxType(field.getDataType())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index 3d1283b6199275..16c1374c0b7b4b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -480,16 +480,12 @@ public static DataType fromCatalogType(Type type) { // In the past, variant metadata used the ScalarType type. // Now, we use VariantType, which inherits from ScalarType, as the new metadata storage. if (type instanceof org.apache.doris.catalog.VariantType) { - List variantFields = ((org.apache.doris.catalog.VariantType) type) - .getPredefinedFields().stream() + List variantPathPatterns = ((org.apache.doris.catalog.VariantType) type) + .getVariantPathPatterns().stream() .map(cf -> new VariantField(cf.getPattern(), fromCatalogType(cf.getType()), cf.getComment() == null ? "" : cf.getComment(), cf.getPatternType().toString())) .collect(ImmutableList.toImmutableList()); - List variantSkipPatterns = ((org.apache.doris.catalog.VariantType) type) - .getSkipPatterns().stream() - .map(sp -> new VariantSkipPattern(sp.getPattern(), sp.getPatternType().name())) - .collect(ImmutableList.toImmutableList()); - return new VariantType(variantFields, variantSkipPatterns, + return new VariantType(variantPathPatterns, ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount(), ((org.apache.doris.catalog.VariantType) type).getEnableTypedPathsToSparse(), ((org.apache.doris.catalog.VariantType) type).getVariantMaxSparseColumnStatisticsSize(), @@ -1117,10 +1113,10 @@ private static void validateScalarType(ScalarType scalarType) { break; } case VARIANT: { - ArrayList predefinedFields = - ((org.apache.doris.catalog.VariantType) scalarType).getPredefinedFields(); + ArrayList typedPathPatterns = + ((org.apache.doris.catalog.VariantType) scalarType).getVariantTypedPathPatterns(); Set fieldPatterns = new HashSet<>(); - for (org.apache.doris.catalog.VariantField field : predefinedFields) { + for (org.apache.doris.catalog.VariantField field : typedPathPatterns) { Type fieldType = field.getType(); validateNestedType(scalarType, fieldType); if (!fieldPatterns.add(field.getPattern())) { 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java index a8e3bd9ded136b..8bdf96cf1a13f0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java @@ -53,6 +53,10 @@ public VariantField(String pattern, DataType dataType, String comment, String pa TPatternType type; if (TPatternType.MATCH_NAME.name().equalsIgnoreCase(patternType)) { type = TPatternType.MATCH_NAME; + } else if (TPatternType.SKIP_NAME.name().equalsIgnoreCase(patternType)) { + type = TPatternType.SKIP_NAME; + } else if (TPatternType.SKIP_NAME_GLOB.name().equalsIgnoreCase(patternType)) { + type = TPatternType.SKIP_NAME_GLOB; } else { type = TPatternType.MATCH_NAME_GLOB; } @@ -71,6 +75,14 @@ public String getComment() { return comment; } + public boolean isSkipPatternType() { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public boolean isTypedPathPatternType() { + return patternType == TPatternType.MATCH_NAME || patternType == TPatternType.MATCH_NAME_GLOB; + } + /** * Check if the given field name matches this field's pattern. * This method uses a restricted glob syntax converted to regex. 
@@ -86,6 +98,9 @@ public String getComment() { * @return true if the field name matches the pattern */ public boolean matches(String fieldName) { + if (!isTypedPathPatternType()) { + return false; + } if (patternType == TPatternType.MATCH_NAME) { return pattern.equals(fieldName); } @@ -111,6 +126,14 @@ public org.apache.doris.catalog.VariantField toCatalogDataType() { */ public String toSql() { StringBuilder sb = new StringBuilder(); + if (isSkipPatternType()) { + sb.append("SKIP "); + if (patternType == TPatternType.SKIP_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } if (patternType == TPatternType.MATCH_NAME) { sb.append(patternType.toString()).append(" "); } @@ -137,12 +160,12 @@ public boolean equals(Object o) { } VariantField that = (VariantField) o; return Objects.equals(pattern, that.pattern) && Objects.equals(dataType, - that.dataType); + that.dataType) && patternType == that.patternType; } @Override public int hashCode() { - return Objects.hash(pattern, dataType); + return Objects.hash(pattern, dataType, patternType); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java deleted file mode 100644 index 3dd6b2bb7f0217..00000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantSkipPattern.java +++ /dev/null @@ -1,137 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.nereids.types; - -import org.apache.doris.common.GlobRegexUtil; -import org.apache.doris.thrift.TPatternType; - -import com.google.re2j.Pattern; -import com.google.re2j.PatternSyntaxException; - -import java.util.Objects; - -/** - * A skip pattern inside a VariantType. - * Specifies field paths that should be irreversibly pruned during data ingestion. - */ -public class VariantSkipPattern { - private final String pattern; - private final TPatternType patternType; - - /** - * VariantSkipPattern Constructor with default MATCH_NAME_GLOB pattern type. - */ - public VariantSkipPattern(String pattern) { - this(pattern, TPatternType.MATCH_NAME_GLOB.name()); - } - - /** - * VariantSkipPattern Constructor. - * Validates glob patterns at DDL time — invalid globs are rejected immediately. 
- * - * @param pattern the glob or exact pattern string - * @param patternType "MATCH_NAME" for exact match, otherwise MATCH_NAME_GLOB - */ - public VariantSkipPattern(String pattern, String patternType) { - this.pattern = Objects.requireNonNull(pattern, "pattern should not be null"); - TPatternType type; - if (TPatternType.MATCH_NAME.name().equalsIgnoreCase(patternType)) { - type = TPatternType.MATCH_NAME; - } else { - type = TPatternType.MATCH_NAME_GLOB; - } - this.patternType = type; - // DDL-time validation: compile glob to catch syntax errors early - if (this.patternType == TPatternType.MATCH_NAME_GLOB) { - try { - GlobRegexUtil.getOrCompilePattern(this.pattern); - } catch (PatternSyntaxException | IllegalArgumentException e) { - throw new IllegalArgumentException( - "Invalid glob pattern for SKIP: '" + this.pattern + "': " + e.getMessage(), e); - } - } - } - - public String getPattern() { - return pattern; - } - - public TPatternType getPatternType() { - return patternType; - } - - /** - * Check if the given field path matches this skip pattern. - * Note: This method is currently unused in FE. The actual skip pattern matching - * is performed in BE's JSON parser (should_skip_path) during data ingestion. - * Kept here for potential future FE-side validation or testing use. - */ - public boolean matches(String fieldPath) { - if (patternType == TPatternType.MATCH_NAME) { - return pattern.equals(fieldPath); - } - try { - Pattern compiled = GlobRegexUtil.getOrCompilePattern(pattern); - return compiled.matcher(fieldPath).matches(); - } catch (PatternSyntaxException | IllegalArgumentException e) { - return false; - } - } - - /** - * Convert to SQL string representation. - */ - public String toSql() { - StringBuilder sb = new StringBuilder(); - sb.append("SKIP "); - if (patternType == TPatternType.MATCH_NAME) { - sb.append("MATCH_NAME "); - } - sb.append("'").append(pattern).append("'"); - return sb.toString(); - } - - /** - * Convert to Catalog layer type. 
- */ - public org.apache.doris.catalog.VariantSkipPattern toCatalogType() { - return new org.apache.doris.catalog.VariantSkipPattern(pattern, patternType); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - VariantSkipPattern that = (VariantSkipPattern) o; - return Objects.equals(pattern, that.pattern) && patternType == that.patternType; - } - - @Override - public int hashCode() { - return Objects.hash(pattern, patternType); - } - - @Override - public String toString() { - return toSql(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index 4f005df31f3605..0fe92b8c188376 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -49,8 +49,7 @@ public class VariantType extends PrimitiveType { private final int variantMaxSparseColumnStatisticsSize; - private final List predefinedFields; - private final List skipPatterns; + private final List variantPathPatterns; private final int variantSparseHashShardCount; private final boolean enableVariantDocMode; @@ -58,14 +57,13 @@ public class VariantType extends PrimitiveType { private final int variantDocShardCount; /** - * Creates a Variant type without predefined fields and only configures the max subcolumn limit. + * Creates a Variant type without variant path patterns and only configures the max subcolumn limit. 
* * @param variantMaxSubcolumnsCount max number of subcolumns allowed (0 means unlimited) */ public VariantType(int variantMaxSubcolumnsCount) { this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; - this.predefinedFields = Lists.newArrayList(); - this.skipPatterns = Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; this.variantSparseHashShardCount = 0; @@ -75,11 +73,10 @@ public VariantType(int variantMaxSubcolumnsCount) { } /** - * Contains predefined fields like struct + * Variant path patterns, including typed paths and skip rules. */ public VariantType(List fields) { - this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); - this.skipPatterns = Lists.newArrayList(); + this.variantPathPatterns = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -90,9 +87,9 @@ public VariantType(List fields) { } /** - * Creates a Variant type with predefined fields and advanced optional properties. + * Creates a Variant type with variant path patterns and advanced optional properties. 
* - * @param fields predefined variant path fields + * @param variantPathPatterns variant path patterns, including typed paths and skip rules * @param variantMaxSubcolumnsCount max number of subcolumns allowed * @param enableTypedPathsToSparse whether typed paths should be materialized as sparse columns * @param variantMaxSparseColumnStatisticsSize upper bound of sparse path statistics entries @@ -100,14 +97,13 @@ public VariantType(List fields) { * @param enableVariantDocMode whether to enable variant doc snapshot writing mode * @param variantDocMaterializationMinRows minimum rows to generate doc snapshot columns */ - public VariantType(List fields, List skipPatterns, + public VariantType(List variantPathPatterns, int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, int variantSparseHashShardCount, boolean enableVariantDocMode, long variantDocMaterializationMinRows, int variantDocShardCount) { - this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); - this.skipPatterns = ImmutableList.copyOf( - Objects.requireNonNull(skipPatterns, "skipPatterns should not be null")); + this.variantPathPatterns = ImmutableList.copyOf( + Objects.requireNonNull(variantPathPatterns, "variantPathPatterns should not be null")); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; @@ -119,8 +115,8 @@ public VariantType(List fields, List skipPatte @Override public DataType conversion() { - return new VariantType(predefinedFields.stream().map(VariantField::conversion) - .collect(Collectors.toList()), skipPatterns, + return new VariantType(variantPathPatterns.stream().map(VariantField::conversion) + .collect(Collectors.toList()), variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, 
variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, @@ -129,12 +125,10 @@ public DataType conversion() { @Override public Type toCatalogDataType() { - org.apache.doris.catalog.VariantType type = new org.apache.doris.catalog.VariantType(predefinedFields.stream() + org.apache.doris.catalog.VariantType type = + new org.apache.doris.catalog.VariantType(variantPathPatterns.stream() .map(VariantField::toCatalogDataType) .collect(Collectors.toCollection(ArrayList::new)), - skipPatterns.stream() - .map(VariantSkipPattern::toCatalogType) - .collect(Collectors.toCollection(ArrayList::new)), variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount); @@ -151,12 +145,8 @@ public String toSql() { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); - if (!skipPatterns.isEmpty()) { - sb.append(skipPatterns.stream().map(VariantSkipPattern::toSql).collect(Collectors.joining(","))); - sb.append(","); - } - if (!predefinedFields.isEmpty()) { - sb.append(predefinedFields.stream().map(VariantField::toSql).collect(Collectors.joining(","))); + if (!variantPathPatterns.isEmpty()) { + sb.append(variantPathPatterns.stream().map(VariantField::toSql).collect(Collectors.joining(","))); sb.append(","); } @@ -202,8 +192,7 @@ public boolean equals(Object o) { && this.enableTypedPathsToSparse == other.enableTypedPathsToSparse && this.enableVariantDocMode == other.enableVariantDocMode && this.variantDocMaterializationMinRows == other.variantDocMaterializationMinRows - && Objects.equals(predefinedFields, other.predefinedFields) - && Objects.equals(skipPatterns, other.skipPatterns); + && Objects.equals(variantPathPatterns, other.variantPathPatterns); } @Override @@ -215,12 +204,14 @@ public boolean equalsForRecursiveCte(Object o) { return false; } VariantType other = (VariantType) o; - if 
(predefinedFields.size() != other.predefinedFields.size()) { + List typedPathPatterns = getVariantTypedPathPatterns(); + List otherTypedPathPatterns = other.getVariantTypedPathPatterns(); + if (typedPathPatterns.size() != otherTypedPathPatterns.size()) { return false; } - for (int i = 0; i < predefinedFields.size(); ++i) { - if (!predefinedFields.get(i).getDataType() - .equalsForRecursiveCte(other.predefinedFields.get(i).getDataType())) { + for (int i = 0; i < typedPathPatterns.size(); ++i) { + if (!typedPathPatterns.get(i).getDataType() + .equalsForRecursiveCte(otherTypedPathPatterns.get(i).getDataType())) { return false; } } @@ -232,7 +223,7 @@ public int hashCode() { return Objects.hash(super.hashCode(), variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount, - predefinedFields, skipPatterns); + variantPathPatterns); } @Override @@ -245,12 +236,14 @@ public String toString() { return toSql(); } - public List getPredefinedFields() { - return predefinedFields; + public List getVariantPathPatterns() { + return variantPathPatterns; } - public List getSkipPatterns() { - return skipPatterns; + public List getVariantTypedPathPatterns() { + return variantPathPatterns.stream() + .filter(VariantField::isTypedPathPatternType) + .collect(ImmutableList.toImmutableList()); } /** @@ -261,7 +254,7 @@ public List getSkipPatterns() { * @return Optional containing the matching VariantField, or empty if no match */ public Optional findMatchingField(String fieldName) { - for (VariantField field : predefinedFields) { + for (VariantField field : getVariantTypedPathPatterns()) { if (field.matches(fieldName)) { return Optional.of(field); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 
d32b50f9561a76..472299f64a9155 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -394,9 +394,9 @@ private Pair getColumnType(List typeNodes, int start) } else if (tPrimitiveType == TPrimitiveType.VARIANT) { // Preserve VARIANT-specific properties from PTypeNode, especially variant_max_subcolumns_count. int maxSubcolumns = typeNode.getVariantMaxSubcolumnsCount(); - // Currently no predefined fields are carried in PTypeNode for VARIANT, so use empty list and default + // Currently no variant path patterns are carried in PTypeNode for VARIANT, so use empty list and default // values for other properties. - type = new VariantType(new ArrayList<>(), new ArrayList<>(), maxSubcolumns, + type = new VariantType(new ArrayList<>(), maxSubcolumns, /*enableTypedPathsToSparse*/ false, /*variantMaxSparseColumnStatisticsSize*/ 10000, /*variantSparseHashShardCount*/ 0, @@ -555,4 +555,3 @@ public void checkAuth(ConnectContext ctx) { } } } - diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index f599ff64fc0593..0ae4163dbe55fd 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -346,11 +346,8 @@ message AlterTabletPB { enum PatternTypePB { MATCH_NAME = 1; MATCH_NAME_GLOB = 2; -} - -message SkipPatternPB { - optional string pattern = 1; - optional PatternTypePB pattern_type = 2; + SKIP_NAME = 3; + SKIP_NAME_GLOB = 4; } message ColumnPB { @@ -395,8 +392,6 @@ message ColumnPB { optional int64 variant_doc_materialization_min_rows = 32; // Number of buckets used to store doc map in variant doc mode. 
optional int32 variant_doc_hash_shard_count = 33 [default = 64]; - // skip patterns for variant column - repeated SkipPatternPB skip_patterns = 34; } // Dictionary of Schema info, to reduce TabletSchemaCloudPB fdb kv size diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index eadcbdf77325d0..df4a50956fbff3 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -24,7 +24,9 @@ include "Partitions.thrift" enum TPatternType { MATCH_NAME = 1, - MATCH_NAME_GLOB = 2 + MATCH_NAME_GLOB = 2, + SKIP_NAME = 3, + SKIP_NAME_GLOB = 4 } enum TAccessPathType { @@ -70,11 +72,6 @@ struct TColumnAccessPath { 3: optional TMetaAccessPath meta_access_path } -struct TSkipPattern { - 1: optional string pattern - 2: optional TPatternType pattern_type -} - struct TColumn { 1: required string column_name 2: required Types.TColumnType column_type @@ -102,9 +99,8 @@ struct TColumn { 24: optional i32 variant_max_sparse_column_statistics_size = 10000 25: optional i32 variant_sparse_hash_shard_count 26: optional bool variant_enable_doc_mode - 27: optional i64 variant_doc_materialization_min_rows - 28: optional i32 variant_doc_hash_shard_count - 29: optional list skip_patterns + 27: optional i64 variant_doc_materialization_min_rows + 28: optional i32 variant_doc_hash_shard_count } struct TSlotDescriptor { diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy index 30ee9613e7cff5..e1a907688b16cc 100644 --- a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy @@ -86,16 +86,15 @@ suite("test_schema_template_skip", "p0") { qt_skip_priority_1 """ SELECT id, data['num_a'] FROM ${tableName4} ORDER BY id """ qt_skip_priority_2 """ SELECT id, data['other'] FROM ${tableName4} ORDER BY id """ - // Test 5: Invalid glob DDL 
rejection - test { - sql """CREATE TABLE test_skip_invalid_glob ( - `id` bigint NULL, - `data` variant NOT NULL - ) ENGINE=OLAP DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" - exception "Invalid glob pattern" - } + // Test 5: Invalid skip glob is allowed in DDL (same behavior as typed path) + def tableName5 = "test_skip_invalid_glob" + sql "DROP TABLE IF EXISTS ${tableName5}" + sql """CREATE TABLE ${tableName5} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" // Test 6: Glob cross-level matching — pattern spans nested path def tableName6 = "test_skip_glob_cross_level" From 2689dc6f8688b2b951028b9f6006ee2b7de48e9c Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 00:48:16 +0800 Subject: [PATCH 4/8] enhance tests --- be/src/vec/common/variant_util.cpp | 5 +- .../rowset/segment_v2/variant_util_test.cpp | 139 ++++++++++++++++++ be/test/olap/tablet_schema_test.cpp | 45 ++++++ be/test/vec/common/schema_util_test.cpp | 6 +- be/test/vec/jsonb/json_parser_test.cpp | 94 ++++++++++++ .../org/apache/doris/catalog/ColumnTest.java | 63 ++++++++ .../org/apache/doris/catalog/TypeTest.java | 39 +++++ .../nereids/parser/NereidsParserTest.java | 75 ++++++++++ .../nereids/types/VariantFieldMatchTest.java | 66 +++++++++ .../predefine/test_schema_template_skip.out | 6 + .../test_schema_template_skip.groovy | 4 + 11 files changed, 538 insertions(+), 4 deletions(-) diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 47dff23714bb96..41d06e84ddc38f 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -304,7 +304,10 @@ Status build_compiled_skip_matcher( } std::string regex_pattern; - RETURN_IF_ERROR(glob_to_regex(pattern, &regex_pattern)); + auto st = glob_to_regex(pattern, 
&regex_pattern); + if (!st.ok()) { + continue; + } glob_regex_patterns.emplace_back(std::move(regex_pattern)); } diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index 1f0056d4b2f839..5568b88ffda25e 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -508,6 +508,145 @@ TEST(VariantUtilTest, GlobMatchRe2) { EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b")); } +TEST(VariantUtilTest, ShouldSkipPathLegacyPatterns) { + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + {"typed_*", PatternTypePB::MATCH_NAME_GLOB}, + }; + + EXPECT_TRUE(should_skip_path(skip_patterns, "secret")); + EXPECT_TRUE(should_skip_path(skip_patterns, "debug_field")); + EXPECT_FALSE(should_skip_path(skip_patterns, "typed_field")); + EXPECT_FALSE(should_skip_path(skip_patterns, "other")); +} + +TEST(VariantUtilTest, PatternTypeHelpers) { + EXPECT_TRUE(is_typed_path_pattern_type(PatternTypePB::MATCH_NAME)); + EXPECT_TRUE(is_typed_path_pattern_type(PatternTypePB::MATCH_NAME_GLOB)); + EXPECT_FALSE(is_typed_path_pattern_type(PatternTypePB::SKIP_NAME)); + EXPECT_FALSE(is_typed_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + + EXPECT_TRUE(is_skip_exact_path_pattern_type(PatternTypePB::SKIP_NAME)); + EXPECT_FALSE(is_skip_exact_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + EXPECT_TRUE(is_skip_glob_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + EXPECT_FALSE(is_skip_glob_path_pattern_type(PatternTypePB::MATCH_NAME_GLOB)); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherRejectsNullOutPointer) { + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + }; + Status st = build_compiled_skip_matcher(skip_patterns, true, nullptr); + EXPECT_FALSE(st.ok()); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherMixedPatterns) { + std::vector> skip_patterns = { + {"secret", 
PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + {"[invalid", PatternTypePB::SKIP_NAME_GLOB}, + {"typed_*", PatternTypePB::MATCH_NAME_GLOB}, + }; + + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, false, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_TRUE(matcher != nullptr); + + EXPECT_TRUE(should_skip_path(*matcher, "secret")); + EXPECT_TRUE(should_skip_path(*matcher, "debug_field")); + EXPECT_FALSE(should_skip_path(*matcher, "typed_field")); + EXPECT_FALSE(should_skip_path(*matcher, "other")); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherWithRe2Set) { + std::vector> skip_patterns; + for (int i = 0; i < 40; ++i) { + skip_patterns.emplace_back("k" + std::to_string(i) + "_*", PatternTypePB::SKIP_NAME_GLOB); + } + + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_TRUE(matcher != nullptr); + + EXPECT_TRUE(should_skip_path(*matcher, "k1_abc")); + EXPECT_TRUE(should_skip_path(*matcher, "k39_abc")); + EXPECT_FALSE(should_skip_path(*matcher, "unknown_abc")); +} + +TEST(VariantUtilTest, ParseVariantColumnsApplySkipPatternsFromSchemaChildren) { + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + auto* c = schema_pb.add_column(); + c->set_unique_id(1); + c->set_name("v"); + c->set_type("VARIANT"); + c->set_is_key(false); + c->set_is_nullable(false); + c->set_variant_enable_doc_mode(false); + + // Typed path: should not be skipped. + auto* typed = c->add_children_columns(); + typed->set_unique_id(2); + typed->set_name("num_*"); + typed->set_type("BIGINT"); + typed->set_is_key(false); + typed->set_is_nullable(true); + typed->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); + + // Skip exact. 
+ auto* skip_exact = c->add_children_columns(); + skip_exact->set_unique_id(3); + skip_exact->set_name("secret"); + skip_exact->set_type("STRING"); + skip_exact->set_is_key(false); + skip_exact->set_is_nullable(true); + skip_exact->set_pattern_type(PatternTypePB::SKIP_NAME); + + // Skip glob. + auto* skip_glob = c->add_children_columns(); + skip_glob->set_unique_id(4); + skip_glob->set_name("debug_*"); + skip_glob->set_type("STRING"); + skip_glob->set_is_key(false); + skip_glob->set_is_nullable(true); + skip_glob->set_pattern_type(PatternTypePB::SKIP_NAME_GLOB); + + TabletSchema tablet_schema; + tablet_schema.init_from_pb(schema_pb); + + auto variant = vectorized::ColumnVariant::create(0); + doris::VariantUtil::insert_root_scalar_field( + *variant, vectorized::Field::create_field( + String(R"({"secret":1,"debug_a":2,"keep":3,"num_a":4})"))); + doris::VariantUtil::insert_root_scalar_field( + *variant, vectorized::Field::create_field( + String(R"({"secret":5,"debug_b":6,"keep":7,"num_b":8})"))); + + vectorized::Block block; + block.insert({variant->get_ptr(), std::make_shared(0), "v"}); + + Status st = + parse_and_materialize_variant_columns(block, tablet_schema, std::vector {0}); + ASSERT_TRUE(st.ok()) << st.to_string(); + + const auto& col0 = *block.get_by_position(0).column; + const auto& out = assert_cast(col0); + + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("secret"))); + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("debug_a"))); + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("debug_b"))); + + const auto* sub_keep = out.get_subcolumn(vectorized::PathInData("keep")); + const auto* sub_num_a = out.get_subcolumn(vectorized::PathInData("num_a")); + const auto* sub_num_b = out.get_subcolumn(vectorized::PathInData("num_b")); + ASSERT_TRUE(sub_keep != nullptr); + ASSERT_TRUE(sub_num_a != nullptr); + ASSERT_TRUE(sub_num_b != nullptr); +} + TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { if 
(std::getenv("DORIS_RUN_VARIANT_SKIP_PERF_UT") == nullptr) { GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; diff --git a/be/test/olap/tablet_schema_test.cpp b/be/test/olap/tablet_schema_test.cpp index f5b53d494390ac..6ff0a24b22731f 100644 --- a/be/test/olap/tablet_schema_test.cpp +++ b/be/test/olap/tablet_schema_test.cpp @@ -102,6 +102,51 @@ TEST_F(TabletSchemaTest, test_tablet_column_init_from_thrift) { EXPECT_FALSE(tablet_column.variant_enable_typed_paths_to_sparse()); } +TEST_F(TabletSchemaTest, test_tablet_column_init_from_thrift_skip_pattern_type) { + auto check_pattern_type = [](TPatternType::type thrift_pattern_type, + PatternTypePB expected_pattern_type) { + TColumn tcolumn; + tcolumn.__set_column_name("thrift_column"); + tcolumn.__set_col_unique_id(1001); + TColumnType column_type; + column_type.__set_type(TPrimitiveType::STRING); + column_type.__set_len(255); + tcolumn.__set_column_type(column_type); + tcolumn.__set_is_key(false); + tcolumn.__set_is_allow_null(true); + tcolumn.__set_pattern_type(thrift_pattern_type); + + TabletColumn tablet_column; + tablet_column.init_from_thrift(tcolumn); + EXPECT_EQ(expected_pattern_type, tablet_column.pattern_type()); + }; + + check_pattern_type(TPatternType::SKIP_NAME, PatternTypePB::SKIP_NAME); + check_pattern_type(TPatternType::SKIP_NAME_GLOB, PatternTypePB::SKIP_NAME_GLOB); +} + +TEST_F(TabletSchemaTest, test_tablet_column_pattern_type_roundtrip_skip) { + ColumnPB column_pb; + column_pb.set_unique_id(2001); + column_pb.set_name("variant_skip_col"); + column_pb.set_type("STRING"); + column_pb.set_is_key(false); + column_pb.set_is_nullable(true); + column_pb.set_length(255); + column_pb.set_aggregation("NONE"); + column_pb.set_visible(true); + column_pb.set_pattern_type(PatternTypePB::SKIP_NAME_GLOB); + + TabletColumn tablet_column; + tablet_column.init_from_pb(column_pb); + EXPECT_EQ(PatternTypePB::SKIP_NAME_GLOB, tablet_column.pattern_type()); + + ColumnPB roundtrip_pb; + 
tablet_column.to_schema_pb(&roundtrip_pb); + EXPECT_TRUE(roundtrip_pb.has_pattern_type()); + EXPECT_EQ(PatternTypePB::SKIP_NAME_GLOB, roundtrip_pb.pattern_type()); +} + TEST_F(TabletSchemaTest, test_tablet_index_init_from_pb) { TabletIndexPB index_pb; index_pb.set_index_id(12345); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index 1696130508137f..9e61dba67e175d 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -902,9 +902,9 @@ TEST_F(SchemaUtilTest, TestUpdateLeastSchemaInternal) { PathInData single_path("test_variant.c"); subcolumns_types[single_path] = {std::make_shared()}; - std::map typed_columns; - Status st = - variant_util::update_least_schema_internal(subcolumns_types, schema, 1, typed_columns); + std::map typed_path_columns; + Status st = variant_util::update_least_schema_internal(subcolumns_types, schema, 1, + typed_path_columns); EXPECT_TRUE(st.ok()); // Check results diff --git a/be/test/vec/jsonb/json_parser_test.cpp b/be/test/vec/jsonb/json_parser_test.cpp index e4790f6786c16a..a4f353fe9097f6 100644 --- a/be/test/vec/jsonb/json_parser_test.cpp +++ b/be/test/vec/jsonb/json_parser_test.cpp @@ -19,14 +19,26 @@ #include +#include +#include #include #include "common/config.h" #include "vec/common/string_ref.h" +#include "vec/common/variant_util.h" using doris::vectorized::JSONDataParser; using doris::vectorized::SimdJSONParser; using doris::vectorized::ParseConfig; +using doris::PatternTypePB; + +static std::set collect_paths(const doris::vectorized::ParseResult& result) { + std::set paths; + for (const auto& path : result.paths) { + paths.insert(path.get_path()); + } + return paths; +} TEST(JsonParserTest, ParseSimpleTypes) { JSONDataParser parser; @@ -474,3 +486,85 @@ TEST(JsonParserTest, KeyLengthLimitByConfig) { EXPECT_EQ(result->values[0].get_type(), doris::PrimitiveType::TYPE_JSONB); } } + +TEST(JsonParserTest, 
ParseWithSkipPatternsLegacyAndCompiledMatcher) { + JSONDataParser parser; + + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + }; + + std::string json = R"({"secret":1,"debug_x":2,"keep":3})"; + ParseConfig legacy_config; + legacy_config.skip_path_patterns = &skip_patterns; + auto legacy_result = parser.parse(json.c_str(), json.size(), legacy_config); + ASSERT_TRUE(legacy_result.has_value()); + std::set legacy_paths = collect_paths(legacy_result.value()); + EXPECT_EQ(legacy_paths.find("secret"), legacy_paths.end()); + EXPECT_EQ(legacy_paths.find("debug_x"), legacy_paths.end()); + EXPECT_NE(legacy_paths.find("keep"), legacy_paths.end()); + + std::shared_ptr matcher; + auto st = doris::vectorized::variant_util::build_compiled_skip_matcher(skip_patterns, true, + &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + ParseConfig compiled_config; + compiled_config.skip_path_patterns = &skip_patterns; + compiled_config.compiled_skip_matcher = matcher; + compiled_config.skip_result_cache_capacity = 8; + auto compiled_result = parser.parse(json.c_str(), json.size(), compiled_config); + ASSERT_TRUE(compiled_result.has_value()); + std::set compiled_paths = collect_paths(compiled_result.value()); + EXPECT_EQ(legacy_paths, compiled_paths); +} + +TEST(JsonParserTest, ParseWithInvalidSkipGlobDoesNotDropPaths) { + JSONDataParser parser; + std::vector> skip_patterns = { + {"[invalid", PatternTypePB::SKIP_NAME_GLOB}, + }; + std::string json = R"({"invalid":1,"keep":2})"; + + ParseConfig config; + config.skip_path_patterns = &skip_patterns; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + std::set paths = collect_paths(result.value()); + EXPECT_NE(paths.find("invalid"), paths.end()); + EXPECT_NE(paths.find("keep"), paths.end()); + + std::shared_ptr matcher; + auto st = doris::vectorized::variant_util::build_compiled_skip_matcher(skip_patterns, true, + 
&matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + ParseConfig compiled_config; + compiled_config.skip_path_patterns = &skip_patterns; + compiled_config.compiled_skip_matcher = matcher; + auto compiled_result = parser.parse(json.c_str(), json.size(), compiled_config); + ASSERT_TRUE(compiled_result.has_value()); + std::set compiled_paths = collect_paths(compiled_result.value()); + EXPECT_NE(compiled_paths.find("invalid"), compiled_paths.end()); + EXPECT_NE(compiled_paths.find("keep"), compiled_paths.end()); +} + +TEST(JsonParserTest, SkipRulesDoNotApplyInsideArrayElements) { + JSONDataParser parser; + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + }; + std::string json = R"([{"secret":1,"keep":2},{"secret":3,"keep":4}])"; + + ParseConfig config; + config.enable_flatten_nested = true; + config.skip_path_patterns = &skip_patterns; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + + std::set paths = collect_paths(result.value()); + // Skip is disabled in traverseArrayElement; element object paths should remain. 
+ EXPECT_NE(paths.find("secret"), paths.end()); + EXPECT_NE(paths.find("keep"), paths.end()); +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java index cde493d4adb409..6fccdec7a71884 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java @@ -28,7 +28,12 @@ import org.apache.doris.common.io.Text; import org.apache.doris.common.jmockit.Deencapsulation; import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.proto.OlapFile; +import org.apache.doris.thrift.TColumn; +import org.apache.doris.thrift.TPatternType; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -37,6 +42,7 @@ import java.io.DataOutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; public class ColumnTest { @@ -173,4 +179,61 @@ public void testBaseColumn() { mvColumnComplex.setDefineExpr(add); Assert.assertTrue(mvColumnComplex.tryGetBaseColumnName().equalsIgnoreCase("mv_b")); } + + @Test + public void testVariantSkipPatternChildrenSerialization() throws Exception { + ArrayList variantPathPatterns = new ArrayList<>(); + // Deliberately interleave skip and typed paths to verify grouped output order. 
+ variantPathPatterns.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + variantPathPatterns.add(new VariantField("num_*", Type.BIGINT, "", TPatternType.MATCH_NAME_GLOB)); + variantPathPatterns.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + variantPathPatterns.add(new VariantField("id", Type.INT, "", TPatternType.MATCH_NAME)); + VariantType variantType = new VariantType(variantPathPatterns); + + Column variantColumn = new Column("v", variantType, true); + Assert.assertEquals(4, variantColumn.getChildren().size()); + Assert.assertEquals(2, variantColumn.getVariantTypedPathChildrenOrEmpty().size()); + Assert.assertEquals(2, variantColumn.getVariantSkipPatternChildrenOrEmpty().size()); + + TColumn thriftColumn = variantColumn.toThrift(); + Assert.assertNotNull(thriftColumn.getChildrenColumn()); + Assert.assertEquals(4, thriftColumn.getChildrenColumnSize()); + Assert.assertEquals("num_*", thriftColumn.getChildrenColumn().get(0).getColumnName()); + Assert.assertEquals(TPatternType.MATCH_NAME_GLOB, thriftColumn.getChildrenColumn().get(0).getPatternType()); + Assert.assertEquals("id", thriftColumn.getChildrenColumn().get(1).getColumnName()); + Assert.assertEquals(TPatternType.MATCH_NAME, thriftColumn.getChildrenColumn().get(1).getPatternType()); + Assert.assertEquals("debug_*", thriftColumn.getChildrenColumn().get(2).getColumnName()); + Assert.assertEquals(TPatternType.SKIP_NAME_GLOB, thriftColumn.getChildrenColumn().get(2).getPatternType()); + Assert.assertEquals("secret", thriftColumn.getChildrenColumn().get(3).getColumnName()); + Assert.assertEquals(TPatternType.SKIP_NAME, thriftColumn.getChildrenColumn().get(3).getPatternType()); + + OlapFile.ColumnPB pbColumn = variantColumn.toPb(Sets.newHashSet(), Lists.newArrayList()); + Assert.assertEquals(4, pbColumn.getChildrenColumnsCount()); + Assert.assertEquals("num_*", pbColumn.getChildrenColumns(0).getName()); + 
Assert.assertEquals(OlapFile.PatternTypePB.MATCH_NAME_GLOB, + pbColumn.getChildrenColumns(0).getPatternType()); + Assert.assertEquals("id", pbColumn.getChildrenColumns(1).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.MATCH_NAME, pbColumn.getChildrenColumns(1).getPatternType()); + Assert.assertEquals("debug_*", pbColumn.getChildrenColumns(2).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.SKIP_NAME_GLOB, + pbColumn.getChildrenColumns(2).getPatternType()); + Assert.assertEquals("secret", pbColumn.getChildrenColumns(3).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.SKIP_NAME, pbColumn.getChildrenColumns(3).getPatternType()); + } + + @Test + public void testVariantSchemaChangeRejectsSkipPatternMutation() { + ArrayList oldPatterns = new ArrayList<>(); + oldPatterns.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + Column oldColumn = new Column("v", new VariantType(oldPatterns), true); + + Column newColumn = new Column("v", new VariantType(new ArrayList<>()), true); + + try { + oldColumn.checkSchemaChangeAllowed(newColumn); + Assert.fail("No exception throws."); + } catch (DdlException e) { + Assert.assertTrue(e.getMessage().contains("Can not change variant skip patterns")); + } + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java index fe3e2b0bd0a2fa..7033bf9a910f45 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java @@ -17,6 +17,8 @@ package org.apache.doris.catalog; +import org.apache.doris.thrift.TPatternType; + import org.junit.Assert; import org.junit.Test; @@ -128,6 +130,43 @@ public void testVariantPredefinedFieldsExactMatch() { Assert.assertFalse(Type.matchExactType(v1, v4, false)); } + @Test + public void testVariantSkipPatternsIgnoredInExactMatch() { + ArrayList fields1 = new ArrayList<>(); + fields1.add(new 
VariantField("typed_a", Type.INT, "", TPatternType.MATCH_NAME)); + fields1.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + fields1.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + VariantType v1 = new VariantType(fields1); + + ArrayList fields2 = new ArrayList<>(); + fields2.add(new VariantField("typed_b", Type.INT, "", TPatternType.MATCH_NAME)); + fields2.add(new VariantField("tmp_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + fields2.add(new VariantField("pwd", Type.STRING, "", TPatternType.SKIP_NAME)); + VariantType v2 = new VariantType(fields2); + + // Exact type check should only compare typed-path fields and ignore skip patterns. + Assert.assertTrue(Type.matchExactType(v1, v2, false)); + + ArrayList fields3 = new ArrayList<>(); + fields3.add(new VariantField("typed_a", Type.BIGINT, "", TPatternType.MATCH_NAME)); + fields3.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + VariantType v3 = new VariantType(fields3); + Assert.assertFalse(Type.matchExactType(v1, v3, false)); + } + + @Test + public void testVariantFieldSkipSqlAndMatchesField() { + VariantField skipExact = new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME); + VariantField skipGlob = new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB); + VariantField typed = new VariantField("secret", Type.STRING, "", TPatternType.MATCH_NAME); + + Assert.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql(0)); + Assert.assertEquals("SKIP 'debug_*'", skipGlob.toSql(0)); + Assert.assertFalse(skipExact.matchesField(typed)); + Assert.assertFalse(typed.matchesField(skipExact)); + Assert.assertFalse(skipExact.equals(typed)); + } + // ===================== Mixed Nesting & Precision ===================== @Test public void testArrayMapStructCombinationWithPrecision() { diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java index 1263cc7e95d877..6f97db27cff8fe 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java @@ -47,6 +47,8 @@ import org.apache.doris.nereids.trees.plans.commands.ExplainCommand; import org.apache.doris.nereids.trees.plans.commands.ExplainCommand.ExplainLevel; import org.apache.doris.nereids.trees.plans.commands.ReplayCommand; +import org.apache.doris.nereids.trees.plans.commands.info.ColumnDefinition; +import org.apache.doris.nereids.trees.plans.commands.info.CreateTableInfo; import org.apache.doris.nereids.trees.plans.commands.merge.MergeIntoCommand; import org.apache.doris.nereids.trees.plans.logical.LogicalAggregate; import org.apache.doris.nereids.trees.plans.logical.LogicalCTE; @@ -61,10 +63,14 @@ import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DecimalV2Type; import org.apache.doris.nereids.types.DecimalV3Type; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VariantField; +import org.apache.doris.nereids.types.VariantType; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.GlobalVariable; import org.apache.doris.qe.SqlModeHelper; import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.thrift.TPatternType; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -1515,6 +1521,75 @@ public void testMergeInto() throws Exception { Assertions.assertThrows(ParseException.class, () -> parser.parseSingle(invalidSql4)); } + @Test + public void testParseVariantSkipPatternsInCreateTable() { + NereidsParser parser = new NereidsParser(); + String sql = "create table t_skip_parse (\n" + + " id int,\n" + + " v variant\n" + + ")\n" + + "duplicate key(id)\n" + + "distributed by hash(id) buckets 1\n" + + 
"properties('replication_num'='1')"; + LogicalPlan logicalPlan = parser.parseSingle(sql); + Assertions.assertInstanceOf(CreateTableCommand.class, logicalPlan); + + CreateTableInfo createTableInfo = ((CreateTableCommand) logicalPlan).getCreateTableInfo(); + ColumnDefinition variantColumn = createTableInfo.getColumnDefinitions().stream() + .filter(c -> "v".equalsIgnoreCase(c.getName())) + .findFirst() + .orElseThrow(() -> new AssertionError("variant column not found")); + + Assertions.assertTrue(variantColumn.getType() instanceof VariantType); + VariantType variantType = (VariantType) variantColumn.getType(); + List variantPathPatterns = variantType.getVariantPathPatterns(); + Assertions.assertEquals(3, variantPathPatterns.size()); + + VariantField skipGlob = variantPathPatterns.get(0); + Assertions.assertTrue(skipGlob.isSkipPatternType()); + Assertions.assertEquals(StringType.INSTANCE, skipGlob.getDataType()); + Assertions.assertEquals(TPatternType.SKIP_NAME_GLOB, skipGlob.toCatalogDataType().getPatternType()); + Assertions.assertEquals("SKIP 'debug_*'", skipGlob.toSql()); + + VariantField skipExact = variantPathPatterns.get(1); + Assertions.assertTrue(skipExact.isSkipPatternType()); + Assertions.assertEquals(StringType.INSTANCE, skipExact.getDataType()); + Assertions.assertEquals(TPatternType.SKIP_NAME, skipExact.toCatalogDataType().getPatternType()); + Assertions.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql()); + + VariantField typedPattern = variantPathPatterns.get(2); + Assertions.assertTrue(typedPattern.isTypedPathPatternType()); + Assertions.assertEquals(TPatternType.MATCH_NAME_GLOB, typedPattern.toCatalogDataType().getPatternType()); + Assertions.assertTrue(typedPattern.matches("num_a")); + Assertions.assertEquals(1, variantType.getVariantTypedPathPatterns().size()); + } + + @Test + public void testParseVariantSkipOnlyWithDocMode() { + NereidsParser parser = new NereidsParser(); + String sql = "create table t_skip_doc_mode (\n" + + " id int,\n" + 
+ " v variant\n" + + ")\n" + + "duplicate key(id)\n" + + "distributed by hash(id) buckets 1\n" + + "properties('replication_num'='1')"; + LogicalPlan logicalPlan = parser.parseSingle(sql); + Assertions.assertInstanceOf(CreateTableCommand.class, logicalPlan); + + CreateTableInfo createTableInfo = ((CreateTableCommand) logicalPlan).getCreateTableInfo(); + ColumnDefinition variantColumn = createTableInfo.getColumnDefinitions().stream() + .filter(c -> "v".equalsIgnoreCase(c.getName())) + .findFirst() + .orElseThrow(() -> new AssertionError("variant column not found")); + + VariantType variantType = (VariantType) variantColumn.getType(); + Assertions.assertTrue(variantType.getEnableVariantDocMode()); + Assertions.assertEquals(1, variantType.getVariantPathPatterns().size()); + Assertions.assertEquals(0, variantType.getVariantTypedPathPatterns().size()); + Assertions.assertTrue(variantType.getVariantPathPatterns().get(0).isSkipPatternType()); + } + @Test public void testUnnest() { String sql = "SELECT t.* FROM LATERAL unnest([1,2], ['hi','hello']) WITH ORDINALITY AS t(c1,c2);"; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java index 66289238e86414..6be411de9fd357 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java @@ -362,4 +362,70 @@ public void testGlobCharacterClass() { Assertions.assertFalse(field2.matches("int_1")); } + + @Test + public void testSkipPatternFlagsAndSql() { + VariantField skipExact = new VariantField("secret", StringType.INSTANCE, "", + TPatternType.SKIP_NAME.name()); + Assertions.assertTrue(skipExact.isSkipPatternType()); + Assertions.assertFalse(skipExact.isTypedPathPatternType()); + Assertions.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql()); + 
Assertions.assertEquals(TPatternType.SKIP_NAME, skipExact.toCatalogDataType().getPatternType()); + + VariantField skipGlob = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + Assertions.assertTrue(skipGlob.isSkipPatternType()); + Assertions.assertFalse(skipGlob.isTypedPathPatternType()); + Assertions.assertEquals("SKIP 'debug_*'", skipGlob.toSql()); + Assertions.assertEquals(TPatternType.SKIP_NAME_GLOB, skipGlob.toCatalogDataType().getPatternType()); + } + + @Test + public void testSkipPatternNeverMatches() { + VariantField skipExact = new VariantField("secret", StringType.INSTANCE, "", + TPatternType.SKIP_NAME.name()); + VariantField skipGlob = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + Assertions.assertFalse(skipExact.matches("secret")); + Assertions.assertFalse(skipGlob.matches("debug_x")); + } + + @Test + public void testFindMatchingFieldIgnoresSkipPatterns() { + VariantField skip = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + VariantField typed = new VariantField("num_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantType variantType = new VariantType(ImmutableList.of(skip, typed)); + + Assertions.assertFalse(variantType.findMatchingField("debug_x").isPresent()); + Optional result = variantType.findMatchingField("num_a"); + Assertions.assertTrue(result.isPresent()); + Assertions.assertEquals(BigIntType.INSTANCE, result.get().getDataType()); + } + + @Test + public void testEqualsAndHashCodeIncludePatternType() { + VariantField typed = new VariantField("a", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantField skip = new VariantField("a", BigIntType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + + Assertions.assertNotEquals(typed, skip); + Assertions.assertNotEquals(typed.hashCode(), skip.hashCode()); + } + + @Test + public void 
testGetVariantTypedPathPatternsFiltersSkipPatterns() { + VariantField skip = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + VariantField typed1 = new VariantField("num_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantField typed2 = new VariantField("id", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME.name()); + VariantType variantType = new VariantType(ImmutableList.of(skip, typed1, typed2)); + + Assertions.assertEquals(3, variantType.getVariantPathPatterns().size()); + Assertions.assertEquals(2, variantType.getVariantTypedPathPatterns().size()); + } } diff --git a/regression-test/data/variant_p0/predefine/test_schema_template_skip.out b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out index ec191be3265da6..37f9345d5d42c7 100644 --- a/regression-test/data/variant_p0/predefine/test_schema_template_skip.out +++ b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out @@ -39,6 +39,12 @@ -- !skip_priority_2 -- 1 val +-- !skip_invalid_glob_1 -- +1 x + +-- !skip_invalid_glob_2 -- +1 y + -- !skip_glob_cross_1 -- 1 \N diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy index e1a907688b16cc..95a03c040c7363 100644 --- a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy @@ -96,6 +96,10 @@ suite("test_schema_template_skip", "p0") { DISTRIBUTED BY HASH(`id`) BUCKETS 1 PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + sql """insert into ${tableName5} values(1, '{"i":"x","invalid":"y"}')""" + qt_skip_invalid_glob_1 """ SELECT id, data['i'] FROM ${tableName5} ORDER BY id """ + qt_skip_invalid_glob_2 """ SELECT id, data['invalid'] FROM ${tableName5} ORDER BY id """ + // Test 6: Glob cross-level matching — pattern spans 
nested path def tableName6 = "test_skip_glob_cross_level" sql "DROP TABLE IF EXISTS ${tableName6}" From 1c44ebafc1ae24f5ad517947035f204d24699cb8 Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 01:02:16 +0800 Subject: [PATCH 5/8] reduce test size --- .../rowset/segment_v2/variant_util_test.cpp | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index 5568b88ffda25e..870bf98be07ad5 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -54,11 +54,15 @@ static uint64_t _splitmix64(uint64_t x) { return x ^ (x >> 31); } +static constexpr size_t kPerfNestedDim = 8; +static constexpr size_t kPerfNestedLeafCount = + kPerfNestedDim * kPerfNestedDim * kPerfNestedDim * kPerfNestedDim; + static std::string _path_of_leaf_id(size_t leaf_id) { - const size_t g = leaf_id / 1000; - const size_t s = (leaf_id / 100) % 10; - const size_t t = (leaf_id / 10) % 10; - const size_t k = leaf_id % 10; + const size_t g = leaf_id / (kPerfNestedDim * kPerfNestedDim * kPerfNestedDim); + const size_t s = (leaf_id / (kPerfNestedDim * kPerfNestedDim)) % kPerfNestedDim; + const size_t t = (leaf_id / kPerfNestedDim) % kPerfNestedDim; + const size_t k = leaf_id % kPerfNestedDim; std::string path; path.reserve(16); path += "g"; @@ -77,21 +81,22 @@ static std::string _build_nested_json_row(size_t row_idx, uint64_t seed) { root.reserve(220000); root.push_back('{'); bool first_g = true; - for (size_t g = 0; g < 10; ++g) { + for (size_t g = 0; g < kPerfNestedDim; ++g) { std::string g_obj; g_obj.push_back('{'); bool first_s = true; - for (size_t s = 0; s < 10; ++s) { + for (size_t s = 0; s < kPerfNestedDim; ++s) { std::string s_obj; s_obj.push_back('{'); bool first_t = true; - for (size_t t = 0; t < 10; ++t) { + for (size_t t = 0; t < kPerfNestedDim; ++t) { std::string t_obj; 
t_obj.push_back('{'); bool first_k = true; - for (size_t k = 0; k < 10; ++k) { - const size_t leaf_id = ((g * 10 + s) * 10 + t) * 10 + k; - // Keep 10k nested columns per row to stress skip-pattern matching. + for (size_t k = 0; k < kPerfNestedDim; ++k) { + const size_t leaf_id = + ((g * kPerfNestedDim + s) * kPerfNestedDim + t) * kPerfNestedDim + k; + // Keep many nested columns per row to stress skip-pattern matching. if (!first_k) { t_obj.push_back(','); } @@ -171,7 +176,7 @@ static std::vector> _build_skip_patterns_f patterns.reserve(96); // Exact match patterns. - for (size_t leaf_id = 0; leaf_id < 10000; leaf_id += 211) { + for (size_t leaf_id = 0; leaf_id < kPerfNestedLeafCount; leaf_id += 211) { patterns.emplace_back(_path_of_leaf_id(leaf_id), PatternTypePB::SKIP_NAME); } @@ -182,7 +187,7 @@ static std::vector> _build_skip_patterns_f } // Matched glob patterns. - for (size_t g = 0; g < 10; ++g) { + for (size_t g = 0; g < kPerfNestedDim; ++g) { std::string pattern = "g"; pattern.push_back(static_cast('0' + g)); pattern += ".s?.t?.k[02468]"; @@ -652,7 +657,7 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; } - constexpr size_t kRows = 1000; + constexpr size_t kRows = 200; constexpr uint64_t kSeed = 0x20260211ULL; const auto json_rows = _build_nested_json_rows(kRows, kSeed); const auto json_column = _make_json_column(json_rows); @@ -716,7 +721,8 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { static_cast(optimized_result.elapsed_ms) : 0.0; - LOG(INFO) << "skip-pattern perf compare (1000 rows, 10k nested columns, same random data): " + LOG(INFO) << "skip-pattern perf compare (" << kRows << " rows, " << kPerfNestedLeafCount + << " nested columns, same random data): " << "no_skip_ms=" << no_skip_result.elapsed_ms << ", " << "legacy_ms=" << legacy_result.elapsed_ms << ", optimized_ms=" << optimized_result.elapsed_ms From 
27dd69906bc9ea5d8ae8c58fe656d383052ae1ce Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 22:13:49 +0800 Subject: [PATCH 6/8] cross-row cache --- be/src/vec/common/variant_util.cpp | 1 + be/src/vec/json/json_parser.cpp | 142 +++++++++++++++--- be/src/vec/json/json_parser.h | 51 +++++-- .../rowset/segment_v2/variant_util_test.cpp | 104 ++++++++++++- 4 files changed, 267 insertions(+), 31 deletions(-) diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 41d06e84ddc38f..273c27c48830be 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -2196,6 +2196,7 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t configs[i].skip_path_patterns = &variant_skip_path_patterns[i]; RETURN_IF_ERROR(build_compiled_skip_matcher(variant_skip_path_patterns[i], true, &configs[i].compiled_skip_matcher)); + configs[i].adaptive_skip_result_cache_capacity = true; } // if doc mode is not enabled, no need to parse to doc value column if (!column.variant_enable_doc_mode()) { diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index 150bb7f632f91c..a188c986a59244 100644 --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include "common/cast_set.h" @@ -37,6 +38,67 @@ namespace doris::vectorized { #include "common/compile_check_begin.h" +constexpr size_t kAdaptiveSkipCacheMaxCapacity = 1UL << 14; // 16384 + +template +void JSONDataParser::reset_skip_cache() { + skip_cache.clear(); + skip_cache_lru.clear(); +} + +template +void JSONDataParser::prepare_skip_cache(const ParseConfig& config, + ParseContext& context) { + const bool has_skip_path_patterns = + context.skip_matcher != nullptr || + (context.skip_path_patterns != nullptr && !context.skip_path_patterns->empty()); + if (!has_skip_path_patterns || context.skip_result_cache_capacity == 0) { + reset_skip_cache(); + 
skip_cache_matcher_holder.reset(); + skip_cache_patterns = nullptr; + skip_cache_config_capacity = 0; + skip_cache_adaptive = false; + skip_cache_learned_capacity = 0; + context.skip_cache = nullptr; + context.skip_cache_lru = nullptr; + context.skip_cache_unbounded = false; + return; + } + + const bool adaptive = config.adaptive_skip_result_cache_capacity; + const bool matcher_changed = context.skip_matcher != nullptr + ? skip_cache_matcher_holder.get() != context.skip_matcher + : skip_cache_matcher_holder != nullptr; + const bool patterns_changed = + context.skip_matcher == nullptr && skip_cache_patterns != context.skip_path_patterns; + const bool cache_config_changed = + matcher_changed || patterns_changed || + skip_cache_config_capacity != context.skip_result_cache_capacity || + skip_cache_adaptive != adaptive; + if (cache_config_changed) { + reset_skip_cache(); + skip_cache_matcher_holder = config.compiled_skip_matcher; + skip_cache_patterns = context.skip_path_patterns; + skip_cache_config_capacity = context.skip_result_cache_capacity; + skip_cache_adaptive = adaptive; + skip_cache_learned_capacity = 0; + if (!skip_cache_adaptive) { + skip_cache.reserve(skip_cache_config_capacity); + } + } + context.skip_cache = &skip_cache; + context.skip_cache_lru = &skip_cache_lru; + context.skip_cache_unbounded = false; + if (skip_cache_adaptive) { + if (skip_cache_learned_capacity == 0) { + context.skip_cache_unbounded = true; + } else { + context.skip_result_cache_capacity = static_cast( + std::min(skip_cache_learned_capacity, kAdaptiveSkipCacheMaxCapacity)); + } + } +} + template std::optional JSONDataParser::parse(const char* begin, size_t length, const ParseConfig& config) { @@ -50,12 +112,19 @@ std::optional JSONDataParser::parse(const char* begin, context.skip_path_patterns = config.skip_path_patterns; context.skip_matcher = config.compiled_skip_matcher.get(); context.skip_result_cache_capacity = config.skip_result_cache_capacity; - if 
(context.skip_result_cache_capacity > 0 && - (context.skip_matcher != nullptr || - (context.skip_path_patterns != nullptr && !context.skip_path_patterns->empty()))) { - context.skip_cache.reserve(context.skip_result_cache_capacity); - } +#ifdef BE_TEST + context.skip_cache_stats = config.skip_cache_stats; +#endif + prepare_skip_cache(config, context); traverse(document, context); + if (skip_cache_adaptive && context.skip_cache != nullptr && context.skip_cache_unbounded && + !skip_cache.empty()) { + const size_t learned_capacity = std::min(skip_cache.size(), kAdaptiveSkipCacheMaxCapacity); + if (learned_capacity > 0) { + const size_t rounded_capacity = std::bit_ceil(learned_capacity); + skip_cache_learned_capacity = std::min(rounded_capacity, kAdaptiveSkipCacheMaxCapacity); + } + } ParseResult result; result.values = std::move(context.values); result.paths.reserve(context.paths.size()); @@ -125,13 +194,28 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC ctx.current_path.append(key.data(), key.size()); bool is_skipped = false; - if (ctx.skip_result_cache_capacity > 0) { - auto cache_it = ctx.skip_cache.find(ctx.current_path); - if (cache_it != ctx.skip_cache.end()) { + if (ctx.skip_cache != nullptr) { +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->lookup_count; + } +#endif + auto cache_it = ctx.skip_cache->find(ctx.current_path); + if (cache_it != ctx.skip_cache->end()) { is_skipped = cache_it->second.is_skipped; - ctx.skip_cache_lru.splice(ctx.skip_cache_lru.begin(), ctx.skip_cache_lru, - cache_it->second.lru_it); + ctx.skip_cache_lru->splice(ctx.skip_cache_lru->begin(), *ctx.skip_cache_lru, + cache_it->second.lru_it); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->hit_count; + } +#endif } else { +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->miss_count; + } +#endif if (ctx.skip_matcher != nullptr) { is_skipped = 
variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); @@ -140,19 +224,31 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC ctx.current_path); } - if (ctx.skip_cache.size() >= ctx.skip_result_cache_capacity && - !ctx.skip_cache_lru.empty()) { - const auto& evicted_key = ctx.skip_cache_lru.back(); - ctx.skip_cache.erase(evicted_key); - ctx.skip_cache_lru.pop_back(); + const size_t cache_capacity = ctx.skip_cache_unbounded + ? kAdaptiveSkipCacheMaxCapacity + : ctx.skip_result_cache_capacity; + if (ctx.skip_cache->size() >= cache_capacity && !ctx.skip_cache_lru->empty()) { + const auto& evicted_key = ctx.skip_cache_lru->back(); + ctx.skip_cache->erase(evicted_key); + ctx.skip_cache_lru->pop_back(); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->evict_count; + } +#endif } - std::string cache_key(ctx.current_path); - ctx.skip_cache_lru.push_front(cache_key); - typename ParseContext::SkipCacheEntry cache_entry; + ctx.skip_cache_lru->push_front(ctx.current_path); + SkipCacheEntry cache_entry; cache_entry.is_skipped = is_skipped; - cache_entry.lru_it = ctx.skip_cache_lru.begin(); - ctx.skip_cache.emplace(std::move(cache_key), std::move(cache_entry)); + cache_entry.lru_it = ctx.skip_cache_lru->begin(); + ctx.skip_cache->emplace(std::string_view(*cache_entry.lru_it), + std::move(cache_entry)); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->insert_count; + } +#endif } } else if (ctx.skip_matcher != nullptr) { is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); @@ -280,6 +376,12 @@ void JSONDataParser::traverseArrayElement(const Element& element, element_ctx.skip_path_patterns = nullptr; element_ctx.skip_matcher = nullptr; element_ctx.skip_result_cache_capacity = 0; + element_ctx.skip_cache_unbounded = false; + element_ctx.skip_cache = nullptr; + element_ctx.skip_cache_lru = nullptr; +#ifdef BE_TEST + element_ctx.skip_cache_stats = nullptr; 
+#endif traverse(element, element_ctx); auto& paths = element_ctx.paths; auto& values = element_ctx.values; diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 890288b451f13a..ccf7e0ba08b55f 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -105,6 +106,16 @@ void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { } } +#ifdef BE_TEST +struct SkipCacheStats { + uint64_t lookup_count = 0; + uint64_t hit_count = 0; + uint64_t miss_count = 0; + uint64_t insert_count = 0; + uint64_t evict_count = 0; +}; +#endif + struct ParseConfig { bool enable_flatten_nested = false; enum class ParseTo { @@ -117,8 +128,15 @@ struct ParseConfig { const std::vector>* skip_path_patterns = nullptr; // pre-compiled skip matcher for hot parsing path std::shared_ptr compiled_skip_matcher = nullptr; - // per-parse cache size for "path -> skip result", 0 means disabled + // max entries for "path -> skip result" cache, 0 means disabled uint16_t skip_result_cache_capacity = 256; + // if true, first effective row learns cache size (capped at 16384) and rounds it up to a + // power of two, then reuses learned size. + bool adaptive_skip_result_cache_capacity = false; +#ifdef BE_TEST + // optional cache stats for tests/observability + SkipCacheStats* skip_cache_stats = nullptr; +#endif }; /// Result of parsing of a document. 
/// Contains all paths extracted from document @@ -136,12 +154,14 @@ class JSONDataParser { std::optional parse(const char* begin, size_t length, const ParseConfig& config); private: - struct ParseContext { - struct SkipCacheEntry { - bool is_skipped = false; - std::list::iterator lru_it; - }; + using SkipCacheLru = std::list; + struct SkipCacheEntry { + bool is_skipped = false; + SkipCacheLru::iterator lru_it; + }; + using SkipCache = phmap::flat_hash_map; + struct ParseContext { PathInDataBuilder builder; std::vector paths; std::vector values; @@ -152,10 +172,14 @@ class JSONDataParser { const std::vector>* skip_path_patterns = nullptr; // pre-compiled skip matcher (nullptr means use skip_path_patterns fallback) const variant_util::CompiledSkipMatcher* skip_matcher = nullptr; - // max entries for skip result cache in one parse invocation + // max entries for skip result cache uint16_t skip_result_cache_capacity = 0; - phmap::flat_hash_map skip_cache; - std::list skip_cache_lru; + bool skip_cache_unbounded = false; + SkipCache* skip_cache = nullptr; + SkipCacheLru* skip_cache_lru = nullptr; +#ifdef BE_TEST + SkipCacheStats* skip_cache_stats = nullptr; +#endif // incrementally maintained dot-separated path for skip matching std::string current_path; }; @@ -191,8 +215,17 @@ class JSONDataParser { void traverseAsJsonb(const Element& element, JsonbWriter& writer); void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer); void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer); + void prepare_skip_cache(const ParseConfig& config, ParseContext& context); + void reset_skip_cache(); ParserImpl parser; + SkipCache skip_cache; + SkipCacheLru skip_cache_lru; + std::shared_ptr skip_cache_matcher_holder; + const std::vector>* skip_cache_patterns = nullptr; + uint16_t skip_cache_config_capacity = 0; + bool skip_cache_adaptive = false; + size_t skip_cache_learned_capacity = 0; }; } // namespace doris::vectorized diff --git 
a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index 870bf98be07ad5..b94212fee27ddd 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -54,7 +54,7 @@ static uint64_t _splitmix64(uint64_t x) { return x ^ (x >> 31); } -static constexpr size_t kPerfNestedDim = 8; +static constexpr size_t kPerfNestedDim = 10; static constexpr size_t kPerfNestedLeafCount = kPerfNestedDim * kPerfNestedDim * kPerfNestedDim * kPerfNestedDim; @@ -652,12 +652,111 @@ TEST(VariantUtilTest, ParseVariantColumnsApplySkipPatternsFromSchemaChildren) { ASSERT_TRUE(sub_num_b != nullptr); } +TEST(VariantUtilTest, SkipPatternCacheHitsAcrossRows) { + constexpr size_t kRows = 64; + std::vector json_rows; + json_rows.reserve(kRows); + for (size_t i = 0; i < kRows; ++i) { + json_rows.emplace_back("{\"secret\":" + std::to_string(i) + + ",\"keep\":" + std::to_string(i + 1) + "}"); + } + + auto json_column = _make_json_column(json_rows); + + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + }; + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + vectorized::ParseConfig cfg; + cfg.enable_flatten_nested = false; + cfg.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + cfg.skip_path_patterns = &skip_patterns; + cfg.compiled_skip_matcher = matcher; + cfg.skip_result_cache_capacity = 8; + cfg.adaptive_skip_result_cache_capacity = true; + vectorized::SkipCacheStats cache_stats; + cfg.skip_cache_stats = &cache_stats; + + auto out = vectorized::ColumnVariant::create(0); + parse_json_to_variant(*out, *json_column, cfg); + + const double hit_rate = cache_stats.lookup_count > 0 + ? static_cast(cache_stats.hit_count) / + static_cast(cache_stats.lookup_count) + : 0.0; + const double miss_rate = cache_stats.lookup_count > 0 + ? 
static_cast(cache_stats.miss_count) / + static_cast(cache_stats.lookup_count) + : 0.0; + LOG(INFO) << "skip cache cross-row stats: " + << "lookups=" << cache_stats.lookup_count << ", " + << "hits=" << cache_stats.hit_count << ", " + << "misses=" << cache_stats.miss_count << ", " + << "hit_rate=" << hit_rate << ", " + << "miss_rate=" << miss_rate << ", " + << "inserts=" << cache_stats.insert_count << ", " + << "evicts=" << cache_stats.evict_count; + + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("secret"))); + EXPECT_TRUE(out->get_subcolumn(vectorized::PathInData("keep")) != nullptr); + // Each row has unique object keys within the row, so cache hits here must be cross-row hits. + EXPECT_EQ(cache_stats.lookup_count, kRows * 2); + EXPECT_EQ(cache_stats.hit_count, (kRows - 1) * 2); + EXPECT_EQ(cache_stats.miss_count, 2); + EXPECT_EQ(cache_stats.insert_count, 2); + EXPECT_EQ(cache_stats.evict_count, 0); +} + +TEST(VariantUtilTest, AdaptiveSkipPatternCacheRoundsUpCapacity) { + std::vector json_rows = { + R"({"a":1,"b":1,"c":1})", + R"({"a":2,"b":2,"c":2,"d":2})", + }; + auto json_column = _make_json_column(json_rows); + + std::vector> skip_patterns = { + {"a", PatternTypePB::SKIP_NAME}, + {"b", PatternTypePB::SKIP_NAME}, + {"c", PatternTypePB::SKIP_NAME}, + {"d", PatternTypePB::SKIP_NAME}, + }; + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + vectorized::ParseConfig cfg; + cfg.enable_flatten_nested = false; + cfg.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + cfg.skip_path_patterns = &skip_patterns; + cfg.compiled_skip_matcher = matcher; + cfg.skip_result_cache_capacity = 1; + cfg.adaptive_skip_result_cache_capacity = true; + vectorized::SkipCacheStats cache_stats; + cfg.skip_cache_stats = &cache_stats; + + auto out = vectorized::ColumnVariant::create(0); + parse_json_to_variant(*out, *json_column, cfg); + + EXPECT_EQ(nullptr, 
out->get_subcolumn(vectorized::PathInData("a"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("b"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("c"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("d"))); + + // First row learns 3 skipped keys and rounds capacity up to 4, so inserting 'd' on second row + // should not evict any cached key. + EXPECT_EQ(cache_stats.insert_count, 4); + EXPECT_EQ(cache_stats.evict_count, 0); +} + TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { if (std::getenv("DORIS_RUN_VARIANT_SKIP_PERF_UT") == nullptr) { GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; } - constexpr size_t kRows = 200; + constexpr size_t kRows = 1000; constexpr uint64_t kSeed = 0x20260211ULL; const auto json_rows = _build_nested_json_rows(kRows, kSeed); const auto json_column = _make_json_column(json_rows); @@ -681,6 +780,7 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { vectorized::ParseConfig optimized_config = legacy_config; optimized_config.compiled_skip_matcher = compiled_matcher; optimized_config.skip_result_cache_capacity = 256; + optimized_config.adaptive_skip_result_cache_capacity = true; auto no_skip_result = _run_parse_perf(*json_column, no_skip_config); auto legacy_result = _run_parse_perf(*json_column, legacy_config); From cff2805fc0aba3027b9fc9d9046d28fce0260491 Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 23:03:35 +0800 Subject: [PATCH 7/8] performance test --- .../rowset/segment_v2/variant_util_test.cpp | 91 ++++++++++++++----- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index b94212fee27ddd..b5baf7849b7c96 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -751,7 +751,7 @@ TEST(VariantUtilTest, 
AdaptiveSkipPatternCacheRoundsUpCapacity) { EXPECT_EQ(cache_stats.evict_count, 0); } -TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { +TEST(VariantUtilTest, SkipPatternPerfCompareOptimizationMatrix) { if (std::getenv("DORIS_RUN_VARIANT_SKIP_PERF_UT") == nullptr) { GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; } @@ -772,22 +772,53 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { legacy_config.skip_path_patterns = &skip_patterns; legacy_config.compiled_skip_matcher = nullptr; legacy_config.skip_result_cache_capacity = 0; + legacy_config.adaptive_skip_result_cache_capacity = false; - std::shared_ptr compiled_matcher; - Status st = build_compiled_skip_matcher(skip_patterns, true, &compiled_matcher); + std::shared_ptr compiled_matcher_with_re2_set; + Status st = build_compiled_skip_matcher(skip_patterns, true, &compiled_matcher_with_re2_set); ASSERT_TRUE(st.ok()) << st.to_string(); + std::shared_ptr compiled_matcher_without_re2_set; + st = build_compiled_skip_matcher(skip_patterns, false, &compiled_matcher_without_re2_set); + ASSERT_TRUE(st.ok()) << st.to_string(); + + // 3) current optimization - is_skipped cache and RE2::Set both disabled. + vectorized::ParseConfig optimized_no_cache_no_re2set_config = legacy_config; + optimized_no_cache_no_re2set_config.compiled_skip_matcher = compiled_matcher_without_re2_set; + optimized_no_cache_no_re2set_config.skip_result_cache_capacity = 0; + optimized_no_cache_no_re2set_config.adaptive_skip_result_cache_capacity = false; + + // 4) current optimization - is_skipped cache disabled. + vectorized::ParseConfig optimized_no_cache_config = legacy_config; + optimized_no_cache_config.compiled_skip_matcher = compiled_matcher_with_re2_set; + optimized_no_cache_config.skip_result_cache_capacity = 0; + optimized_no_cache_config.adaptive_skip_result_cache_capacity = false; + + // 5) current optimization - RE2::Set disabled. 
+ vectorized::ParseConfig optimized_no_re2set_config = legacy_config; + optimized_no_re2set_config.compiled_skip_matcher = compiled_matcher_without_re2_set; + optimized_no_re2set_config.skip_result_cache_capacity = 256; + optimized_no_re2set_config.adaptive_skip_result_cache_capacity = true; + + // 6) current optimization. vectorized::ParseConfig optimized_config = legacy_config; - optimized_config.compiled_skip_matcher = compiled_matcher; + optimized_config.compiled_skip_matcher = compiled_matcher_with_re2_set; optimized_config.skip_result_cache_capacity = 256; optimized_config.adaptive_skip_result_cache_capacity = true; auto no_skip_result = _run_parse_perf(*json_column, no_skip_config); auto legacy_result = _run_parse_perf(*json_column, legacy_config); + auto optimized_no_cache_no_re2set_result = + _run_parse_perf(*json_column, optimized_no_cache_no_re2set_config); + auto optimized_no_cache_result = _run_parse_perf(*json_column, optimized_no_cache_config); + auto optimized_no_re2set_result = _run_parse_perf(*json_column, optimized_no_re2set_config); auto optimized_result = _run_parse_perf(*json_column, optimized_config); ASSERT_EQ(no_skip_result.column->size(), kRows); ASSERT_EQ(legacy_result.column->size(), kRows); + ASSERT_EQ(optimized_no_cache_no_re2set_result.column->size(), kRows); + ASSERT_EQ(optimized_no_cache_result.column->size(), kRows); + ASSERT_EQ(optimized_no_re2set_result.column->size(), kRows); ASSERT_EQ(optimized_result.column->size(), kRows); vectorized::DataTypeSerDe::FormatOptions options; @@ -795,40 +826,54 @@ TEST(VariantUtilTest, SkipPatternPerfCompareNoSkipLegacyOptimized) { for (size_t row = 0; row < kRows; row += 97) { std::string no_skip_row; std::string legacy_row; + std::string optimized_no_cache_no_re2set_row; + std::string optimized_no_cache_row; + std::string optimized_no_re2set_row; std::string optimized_row; no_skip_result.column->serialize_one_row_to_string(row, &no_skip_row, options); 
legacy_result.column->serialize_one_row_to_string(row, &legacy_row, options); + optimized_no_cache_no_re2set_result.column->serialize_one_row_to_string( + row, &optimized_no_cache_no_re2set_row, options); + optimized_no_cache_result.column->serialize_one_row_to_string(row, &optimized_no_cache_row, + options); + optimized_no_re2set_result.column->serialize_one_row_to_string(row, &optimized_no_re2set_row, + options); optimized_result.column->serialize_one_row_to_string(row, &optimized_row, options); if (!found_no_skip_difference && no_skip_row != legacy_row) { found_no_skip_difference = true; } + ASSERT_EQ(legacy_row, optimized_no_cache_no_re2set_row) << "row=" << row; + ASSERT_EQ(legacy_row, optimized_no_cache_row) << "row=" << row; + ASSERT_EQ(legacy_row, optimized_no_re2set_row) << "row=" << row; ASSERT_EQ(legacy_row, optimized_row) << "row=" << row; } ASSERT_TRUE(found_no_skip_difference) << "no-skip output should differ from skip-enabled output on sampled rows"; - const double legacy_vs_no_skip = - no_skip_result.elapsed_ms > 0 ? static_cast(legacy_result.elapsed_ms) / - static_cast(no_skip_result.elapsed_ms) - : 0.0; - const double optimized_vs_no_skip = - no_skip_result.elapsed_ms > 0 ? static_cast(optimized_result.elapsed_ms) / - static_cast(no_skip_result.elapsed_ms) - : 0.0; - const double optimized_vs_legacy = - optimized_result.elapsed_ms > 0 - ? static_cast(legacy_result.elapsed_ms) / - static_cast(optimized_result.elapsed_ms) - : 0.0; - - LOG(INFO) << "skip-pattern perf compare (" << kRows << " rows, " << kPerfNestedLeafCount + const auto safe_speedup = [](int64_t faster, int64_t slower) -> double { + return slower > 0 ? 
static_cast(faster) / static_cast(slower) : 0.0; + }; + + LOG(INFO) << "skip-pattern perf matrix (" << kRows << " rows, " << kPerfNestedLeafCount << " nested columns, same random data): " << "no_skip_ms=" << no_skip_result.elapsed_ms << ", " - << "legacy_ms=" << legacy_result.elapsed_ms + << "legacy_ms=" << legacy_result.elapsed_ms << ", " + << "opt_no_cache_no_re2set_ms=" << optimized_no_cache_no_re2set_result.elapsed_ms + << ", opt_no_cache_ms=" << optimized_no_cache_result.elapsed_ms + << ", opt_no_re2set_ms=" << optimized_no_re2set_result.elapsed_ms << ", optimized_ms=" << optimized_result.elapsed_ms - << ", legacy_vs_no_skip=" << legacy_vs_no_skip - << ", optimized_vs_no_skip=" << optimized_vs_no_skip - << ", optimized_vs_legacy=" << optimized_vs_legacy + << ", speedup_opt_no_cache_no_re2set_vs_legacy=" + << safe_speedup(legacy_result.elapsed_ms, + optimized_no_cache_no_re2set_result.elapsed_ms) + << ", speedup_opt_no_cache_vs_opt_no_cache_no_re2set=" + << safe_speedup(optimized_no_cache_no_re2set_result.elapsed_ms, + optimized_no_cache_result.elapsed_ms) + << ", speedup_optimized_vs_opt_no_re2set=" + << safe_speedup(optimized_no_re2set_result.elapsed_ms, optimized_result.elapsed_ms) + << ", speedup_optimized_vs_opt_no_cache=" + << safe_speedup(optimized_no_cache_result.elapsed_ms, optimized_result.elapsed_ms) + << ", speedup_optimized_vs_legacy=" + << safe_speedup(legacy_result.elapsed_ms, optimized_result.elapsed_ms) << ", skip_patterns=" << skip_patterns.size(); } From 288b22da667f3b2ad306f9989af458f6cde0a63c Mon Sep 17 00:00:00 2001 From: Gary Date: Fri, 13 Feb 2026 23:07:40 +0800 Subject: [PATCH 8/8] fix format --- be/test/olap/rowset/segment_v2/variant_util_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index b5baf7849b7c96..cb5aa29b70da45 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ 
b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -836,8 +836,8 @@ TEST(VariantUtilTest, SkipPatternPerfCompareOptimizationMatrix) { row, &optimized_no_cache_no_re2set_row, options); optimized_no_cache_result.column->serialize_one_row_to_string(row, &optimized_no_cache_row, options); - optimized_no_re2set_result.column->serialize_one_row_to_string(row, &optimized_no_re2set_row, - options); + optimized_no_re2set_result.column->serialize_one_row_to_string( + row, &optimized_no_re2set_row, options); optimized_result.column->serialize_one_row_to_string(row, &optimized_row, options); if (!found_no_skip_difference && no_skip_row != legacy_row) { found_no_skip_difference = true;