diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 8a4c9da9fd2009..93ed6ca079bd75 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -82,6 +82,14 @@ bvar::Window> g_contains_agg_with_cache_if_eligible_full_h "g_contains_agg_with_cache_if_eligible_full_hit_1m", &g_contains_agg_with_cache_if_eligible_full_hit, 60); +namespace { + +inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) { + return static_cast(pattern_type); +} + +} // namespace + TabletMetaSharedPtr TabletMeta::create( const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id, uint32_t next_unique_id, @@ -533,13 +541,7 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count); } if (tcolumn.__isset.pattern_type) { - switch (tcolumn.pattern_type) { - case TPatternType::MATCH_NAME: - column->set_pattern_type(PatternTypePB::MATCH_NAME); - break; - case TPatternType::MATCH_NAME_GLOB: - column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); - } + column->set_pattern_type(to_pattern_type_pb(tcolumn.pattern_type)); } if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) { column->set_variant_enable_typed_paths_to_sparse( diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index d79945f1f89cf5..b6d02ac917b510 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -678,7 +678,7 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { _variant.doc_hash_shard_count = column.variant_doc_hash_shard_count(); } if (column.has_pattern_type()) { - _pattern_type = column.pattern_type(); + _field_pattern_type = column.pattern_type(); } } @@ -755,7 +755,7 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const { column->set_index_length(0); } column->set_variant_max_subcolumns_count(_variant.max_subcolumns_count); - 
column->set_pattern_type(_pattern_type); + column->set_pattern_type(_field_pattern_type); column->set_variant_enable_typed_paths_to_sparse(_variant.enable_typed_paths_to_sparse); column->set_variant_max_sparse_column_statistics_size( _variant.max_sparse_column_statistics_size); diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 8ed0ee239d1b40..56666c6edd4154 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -234,7 +234,7 @@ class TabletColumn : public MetadataAdder { _variant.max_subcolumns_count = variant_max_subcolumns_count; } - PatternTypePB pattern_type() const { return _pattern_type; } + PatternTypePB pattern_type() const { return _field_pattern_type; } bool variant_enable_typed_paths_to_sparse() const { return _variant.enable_typed_paths_to_sparse; @@ -320,7 +320,8 @@ class TabletColumn : public MetadataAdder { // The extracted sub-columns from "variant" contain the following information: int32_t _parent_col_unique_id = -1; // "variant" -> col_unique_id vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves - PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB; + // When pattern_type is absent (legacy metadata), keep typed-path default behavior. 
+ PatternTypePB _field_pattern_type = PatternTypePB::MATCH_NAME_GLOB; VariantParams _variant; }; diff --git a/be/src/vec/common/variant_util.cpp b/be/src/vec/common/variant_util.cpp index 069a64798d062a..273c27c48830be 100644 --- a/be/src/vec/common/variant_util.cpp +++ b/be/src/vec/common/variant_util.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,7 @@ #include "olap/tablet_fwd.h" #include "olap/tablet_schema.h" #include "re2/re2.h" +#include "re2/set.h" #include "runtime/client_cache.h" #include "runtime/define_primitive_type.h" #include "runtime/exec_env.h" @@ -130,6 +132,27 @@ inline void append_escaped_regex_char(std::string* regex_output, char ch) { // Small LRU to cap compiled glob patterns constexpr size_t kGlobRegexCacheCapacity = 256; +constexpr size_t kSkipRe2SetThreshold = 32; + +struct TransparentStringHash { + using is_transparent = void; + size_t operator()(std::string_view s) const { return std::hash {}(s); } + size_t operator()(const std::string& s) const { + return std::hash {}(std::string_view(s)); + } +}; + +struct TransparentStringEq { + using is_transparent = void; + bool operator()(std::string_view lhs, std::string_view rhs) const { return lhs == rhs; } +}; + +struct CompiledSkipMatcher { + phmap::flat_hash_set exact_patterns; + std::vector> glob_regexes; + std::unique_ptr glob_regex_set; + bool use_re2_set = false; +}; struct GlobRegexCacheEntry { std::shared_ptr re2; @@ -259,6 +282,120 @@ bool glob_match_re2(const std::string& glob_pattern, const std::string& candidat return RE2::FullMatch(candidate_path, *compiled); } +Status build_compiled_skip_matcher( + const std::vector>& skip_path_patterns, + bool enable_re2_set, std::shared_ptr* out) { + if (out == nullptr) { + return Status::InvalidArgument("Output pointer for compiled skip matcher is null"); + } + + auto matcher = std::make_shared(); + matcher->exact_patterns.reserve(skip_path_patterns.size()); + + std::vector glob_regex_patterns; 
+ glob_regex_patterns.reserve(skip_path_patterns.size()); + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_exact_path_pattern_type(pt)) { + matcher->exact_patterns.insert(pattern); + continue; + } + if (!is_skip_glob_path_pattern_type(pt)) { + continue; + } + + std::string regex_pattern; + auto st = glob_to_regex(pattern, &regex_pattern); + if (!st.ok()) { + continue; + } + glob_regex_patterns.emplace_back(std::move(regex_pattern)); + } + + if (glob_regex_patterns.empty()) { + *out = std::move(matcher); + return Status::OK(); + } + + if (enable_re2_set && glob_regex_patterns.size() >= kSkipRe2SetThreshold) { + RE2::Options options; + auto set = std::make_unique(options, RE2::ANCHOR_BOTH); + for (const auto& regex_pattern : glob_regex_patterns) { + if (set->Add(regex_pattern, nullptr) < 0) { + return Status::InvalidArgument( + "Failed to add regexp '{}' into skip pattern matcher set", regex_pattern); + } + } + if (!set->Compile()) { + return Status::InvalidArgument("Failed to compile skip pattern matcher set"); + } + matcher->glob_regex_set = std::move(set); + matcher->use_re2_set = true; + } else { + matcher->glob_regexes.reserve(glob_regex_patterns.size()); + for (const auto& regex_pattern : glob_regex_patterns) { + auto compiled = std::make_unique(regex_pattern); + if (!compiled->ok()) { + return Status::InvalidArgument( + "Invalid regexp '{}' generated from skip glob pattern: {}", regex_pattern, + compiled->error()); + } + matcher->glob_regexes.emplace_back(std::move(compiled)); + } + } + + *out = std::move(matcher); + return Status::OK(); +} + +bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path) { + if (matcher.exact_patterns.find(path) != matcher.exact_patterns.end()) { + return true; + } + + if (matcher.use_re2_set) { + std::vector matched_indexes; + return matcher.glob_regex_set->Match(path, &matched_indexes); + } + + for (const auto& regex : matcher.glob_regexes) { + if (RE2::FullMatch(path, *regex)) { + 
return true; + } + } + + return false; +} + +namespace { + +inline bool is_variant_skip_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME || + pattern_type == PatternTypePB::SKIP_NAME_GLOB; +} + +void collect_variant_skip_path_patterns_from_children( + const TabletColumn& column, + std::vector>* skip_path_patterns) { + skip_path_patterns->clear(); + for (const auto& sub_column : column.get_sub_columns()) { + if (!is_variant_skip_path_pattern_type(sub_column->pattern_type())) { + continue; + } + skip_path_patterns->emplace_back(sub_column->name(), sub_column->pattern_type()); + } +} + +bool has_variant_typed_path_children(const TabletColumn& column) { + for (const auto& sub_column : column.get_sub_columns()) { + if (is_typed_path_pattern_type(sub_column->pattern_type())) { + return true; + } + } + return false; +} + +} // namespace + size_t get_number_of_dimensions(const IDataType& type) { if (const auto* type_array = typeid_cast(&type)) { return type_array->get_number_of_dimensions(); @@ -464,10 +601,11 @@ Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) { return Status::OK(); } -Status update_least_schema_internal(const std::map& subcolumns_types, - TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - const std::map& typed_columns, - std::set* path_set) { +Status update_least_schema_internal( + const std::map& subcolumns_types, TabletSchemaSPtr& common_schema, + int32_t variant_col_unique_id, + const std::map& typed_path_columns, + std::set* path_set) { PathsInData tuple_paths; DataTypes tuple_types; CHECK(common_schema.use_count() == 1); @@ -503,10 +641,10 @@ Status update_least_schema_internal(const std::map& subco // Append all common type columns of this variant for (int i = 0; i < tuple_paths.size(); ++i) { TabletColumn common_column; - // typed path not contains root part + // typed path does not include root part auto path_without_root = 
tuple_paths[i].copy_pop_front().get_path(); - if (typed_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) { - common_column = *typed_columns.at(path_without_root); + if (typed_path_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) { + common_column = *typed_path_columns.at(path_without_root); // parent unique id and path may not be init in write path common_column.set_parent_unique_id(variant_col_unique_id); common_column.set_path_info(tuple_paths[i]); @@ -529,10 +667,13 @@ Status update_least_schema_internal(const std::map& subco Status update_least_common_schema(const std::vector& schemas, TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, std::set* path_set) { - std::map typed_columns; + std::map typed_path_columns; for (const TabletColumnPtr& col : common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) { - typed_columns[col->name()] = col; + if (!is_typed_path_pattern_type(col->pattern_type())) { + continue; + } + typed_path_columns[col->name()] = col; } // Types of subcolumns by path from all tuples. std::map subcolumns_types; @@ -556,7 +697,7 @@ Status update_least_common_schema(const std::vector& schemas, RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths)); return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id, - typed_columns, path_set); + typed_path_columns, path_set); } // Keep variant subcolumn BF support aligned with FE DDL checks. @@ -1216,7 +1357,8 @@ Status VariantCompactionUtil::get_extended_compaction_schema( uid_to_paths_set_info[column->unique_id()]); // 4. 
append subcolumns - if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) { + if (column->variant_max_subcolumns_count() > 0 || + has_variant_typed_path_children(*column)) { get_compaction_subcolumns_from_subpaths( uid_to_paths_set_info[column->unique_id()], column, target, uid_to_variant_extended_info[column->unique_id()].path_to_data_types, @@ -2039,6 +2181,8 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t } std::vector configs(variant_column_pos.size()); + std::vector>> variant_skip_path_patterns( + variant_column_pos.size()); for (size_t i = 0; i < variant_column_pos.size(); ++i) { configs[i].enable_flatten_nested = tablet_schema.variant_flatten_nested(); const auto& column = tablet_schema.column(variant_column_pos[i]); @@ -2046,6 +2190,14 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::InternalError("column is not variant type, column name: {}", column.name()); } + // Set skip path patterns if configured on variant children. 
+ collect_variant_skip_path_patterns_from_children(column, &variant_skip_path_patterns[i]); + if (!variant_skip_path_patterns[i].empty()) { + configs[i].skip_path_patterns = &variant_skip_path_patterns[i]; + RETURN_IF_ERROR(build_compiled_skip_matcher(variant_skip_path_patterns[i], true, + &configs[i].compiled_skip_matcher)); + configs[i].adaptive_skip_result_cache_capacity = true; + } // if doc mode is not enabled, no need to parse to doc value column if (!column.variant_enable_doc_mode()) { configs[i].parse_to = ParseConfig::ParseTo::OnlySubcolumns; diff --git a/be/src/vec/common/variant_util.h b/be/src/vec/common/variant_util.h index a36179ac0fbf50..56a9741802f703 100644 --- a/be/src/vec/common/variant_util.h +++ b/be/src/vec/common/variant_util.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,12 +66,53 @@ const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__"; namespace doris::vectorized::variant_util { +struct CompiledSkipMatcher; + +inline bool is_typed_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::MATCH_NAME || + pattern_type == PatternTypePB::MATCH_NAME_GLOB; +} + +inline bool is_skip_exact_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME; +} + +inline bool is_skip_glob_path_pattern_type(PatternTypePB pattern_type) { + return pattern_type == PatternTypePB::SKIP_NAME_GLOB; +} + // Convert a restricted glob pattern into a regex (for tests/internal use). Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern); // Match a glob pattern against a path using RE2. bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path); +// Build an immutable matcher for skip path patterns used in hot parsing paths. 
+Status build_compiled_skip_matcher( + const std::vector>& skip_path_patterns, + bool enable_re2_set, std::shared_ptr* out); + +// Match a dot-separated path against precompiled skip path patterns. +bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path); + +// Check if a dot-separated path should be skipped based on skip path patterns. +// For SKIP_NAME_GLOB, uses glob matching; for SKIP_NAME, uses exact string comparison. +inline bool should_skip_path( + const std::vector>& skip_path_patterns, + const std::string& path) { + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_exact_path_pattern_type(pt) && path == pattern) { + return true; + } + } + for (const auto& [pattern, pt] : skip_path_patterns) { + if (is_skip_glob_path_pattern_type(pt) && glob_match_re2(pattern, path)) { + return true; + } + } + return false; +} + using PathToNoneNullValues = std::unordered_map; using PathToDataTypes = std::unordered_map, PathInData::Hash>; @@ -166,10 +208,11 @@ bool inherit_index(const std::vector& parent_indexes, bool inherit_index(const std::vector& parent_indexes, TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb); -Status update_least_schema_internal(const std::map& subcolumns_types, - TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id, - const std::map& typed_columns, - std::set* path_set = nullptr); +Status update_least_schema_internal( + const std::map& subcolumns_types, TabletSchemaSPtr& common_schema, + int32_t variant_col_unique_id, + const std::map& typed_path_columns, + std::set* path_set = nullptr); bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index eb4d6c5e2b5fee..a188c986a59244 100644 --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -25,17 +25,80 @@ #include #include +#include #include #include 
"common/cast_set.h" #include "common/config.h" #include "common/status.h" +#include "vec/common/variant_util.h" #include "vec/json/path_in_data.h" #include "vec/json/simd_json_parser.h" namespace doris::vectorized { #include "common/compile_check_begin.h" +constexpr size_t kAdaptiveSkipCacheMaxCapacity = 1UL << 14; // 16384 + +template +void JSONDataParser::reset_skip_cache() { + skip_cache.clear(); + skip_cache_lru.clear(); +} + +template +void JSONDataParser::prepare_skip_cache(const ParseConfig& config, + ParseContext& context) { + const bool has_skip_path_patterns = + context.skip_matcher != nullptr || + (context.skip_path_patterns != nullptr && !context.skip_path_patterns->empty()); + if (!has_skip_path_patterns || context.skip_result_cache_capacity == 0) { + reset_skip_cache(); + skip_cache_matcher_holder.reset(); + skip_cache_patterns = nullptr; + skip_cache_config_capacity = 0; + skip_cache_adaptive = false; + skip_cache_learned_capacity = 0; + context.skip_cache = nullptr; + context.skip_cache_lru = nullptr; + context.skip_cache_unbounded = false; + return; + } + + const bool adaptive = config.adaptive_skip_result_cache_capacity; + const bool matcher_changed = context.skip_matcher != nullptr + ? 
skip_cache_matcher_holder.get() != context.skip_matcher + : skip_cache_matcher_holder != nullptr; + const bool patterns_changed = + context.skip_matcher == nullptr && skip_cache_patterns != context.skip_path_patterns; + const bool cache_config_changed = + matcher_changed || patterns_changed || + skip_cache_config_capacity != context.skip_result_cache_capacity || + skip_cache_adaptive != adaptive; + if (cache_config_changed) { + reset_skip_cache(); + skip_cache_matcher_holder = config.compiled_skip_matcher; + skip_cache_patterns = context.skip_path_patterns; + skip_cache_config_capacity = context.skip_result_cache_capacity; + skip_cache_adaptive = adaptive; + skip_cache_learned_capacity = 0; + if (!skip_cache_adaptive) { + skip_cache.reserve(skip_cache_config_capacity); + } + } + context.skip_cache = &skip_cache; + context.skip_cache_lru = &skip_cache_lru; + context.skip_cache_unbounded = false; + if (skip_cache_adaptive) { + if (skip_cache_learned_capacity == 0) { + context.skip_cache_unbounded = true; + } else { + context.skip_result_cache_capacity = static_cast( + std::min(skip_cache_learned_capacity, kAdaptiveSkipCacheMaxCapacity)); + } + } +} + template std::optional JSONDataParser::parse(const char* begin, size_t length, const ParseConfig& config) { @@ -46,7 +109,22 @@ std::optional JSONDataParser::parse(const char* begin, ParseContext context; context.enable_flatten_nested = config.enable_flatten_nested; context.is_top_array = document.isArray(); + context.skip_path_patterns = config.skip_path_patterns; + context.skip_matcher = config.compiled_skip_matcher.get(); + context.skip_result_cache_capacity = config.skip_result_cache_capacity; +#ifdef BE_TEST + context.skip_cache_stats = config.skip_cache_stats; +#endif + prepare_skip_cache(config, context); traverse(document, context); + if (skip_cache_adaptive && context.skip_cache != nullptr && context.skip_cache_unbounded && + !skip_cache.empty()) { + const size_t learned_capacity = std::min(skip_cache.size(), 
kAdaptiveSkipCacheMaxCapacity); + if (learned_capacity > 0) { + const size_t rounded_capacity = std::bit_ceil(learned_capacity); + skip_cache_learned_capacity = std::min(rounded_capacity, kAdaptiveSkipCacheMaxCapacity); + } + } ParseResult result; result.values = std::move(context.values); result.paths.reserve(context.paths.size()); @@ -100,9 +178,98 @@ void JSONDataParser::traverseObject(const JSONObject& object, ParseC fmt::format("Key length exceeds maximum allowed size of {} bytes.", max_key_length)); } - ctx.builder.append(key, false); - traverse(value, ctx); - ctx.builder.pop_back(); + const bool has_skip_path_patterns = + ctx.skip_matcher != nullptr || + (ctx.skip_path_patterns != nullptr && !ctx.skip_path_patterns->empty()); + // Check skip path patterns: build the dot-separated path and test against patterns. + if (has_skip_path_patterns) { + const size_t old_length = ctx.current_path.size(); + const size_t required_capacity = old_length + (old_length ? 1 : 0) + key.size(); + if (ctx.current_path.capacity() < required_capacity) { + ctx.current_path.reserve(required_capacity); + } + if (!ctx.current_path.empty()) { + ctx.current_path.push_back('.'); + } + ctx.current_path.append(key.data(), key.size()); + + bool is_skipped = false; + if (ctx.skip_cache != nullptr) { +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->lookup_count; + } +#endif + auto cache_it = ctx.skip_cache->find(ctx.current_path); + if (cache_it != ctx.skip_cache->end()) { + is_skipped = cache_it->second.is_skipped; + ctx.skip_cache_lru->splice(ctx.skip_cache_lru->begin(), *ctx.skip_cache_lru, + cache_it->second.lru_it); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->hit_count; + } +#endif + } else { +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->miss_count; + } +#endif + if (ctx.skip_matcher != nullptr) { + is_skipped = + variant_util::should_skip_path(*ctx.skip_matcher, 
ctx.current_path); + } else { + is_skipped = variant_util::should_skip_path(*ctx.skip_path_patterns, + ctx.current_path); + } + + const size_t cache_capacity = ctx.skip_cache_unbounded + ? kAdaptiveSkipCacheMaxCapacity + : ctx.skip_result_cache_capacity; + if (ctx.skip_cache->size() >= cache_capacity && !ctx.skip_cache_lru->empty()) { + const auto& evicted_key = ctx.skip_cache_lru->back(); + ctx.skip_cache->erase(evicted_key); + ctx.skip_cache_lru->pop_back(); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->evict_count; + } +#endif + } + + ctx.skip_cache_lru->push_front(ctx.current_path); + SkipCacheEntry cache_entry; + cache_entry.is_skipped = is_skipped; + cache_entry.lru_it = ctx.skip_cache_lru->begin(); + ctx.skip_cache->emplace(std::string_view(*cache_entry.lru_it), + std::move(cache_entry)); +#ifdef BE_TEST + if (ctx.skip_cache_stats != nullptr) { + ++ctx.skip_cache_stats->insert_count; + } +#endif + } + } else if (ctx.skip_matcher != nullptr) { + is_skipped = variant_util::should_skip_path(*ctx.skip_matcher, ctx.current_path); + } else { + is_skipped = + variant_util::should_skip_path(*ctx.skip_path_patterns, ctx.current_path); + } + + if (is_skipped) { + ctx.current_path.resize(old_length); + continue; // skip this key and its entire subtree + } + ctx.builder.append(key, false); + traverse(value, ctx); + ctx.builder.pop_back(); + ctx.current_path.resize(old_length); + } else { + ctx.builder.append(key, false); + traverse(value, ctx); + ctx.builder.pop_back(); + } } } @@ -206,8 +373,19 @@ void JSONDataParser::traverseArrayElement(const Element& element, ParseContext element_ctx; element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; + element_ctx.skip_path_patterns = nullptr; + element_ctx.skip_matcher = nullptr; + element_ctx.skip_result_cache_capacity = 0; + element_ctx.skip_cache_unbounded = false; + element_ctx.skip_cache = nullptr; + element_ctx.skip_cache_lru = 
nullptr; +#ifdef BE_TEST + element_ctx.skip_cache_stats = nullptr; +#endif traverse(element, element_ctx); - auto& [_, paths, values, flatten_nested, __, is_top_array] = element_ctx; + auto& paths = element_ctx.paths; + auto& values = element_ctx.values; + const bool is_top_array = element_ctx.is_top_array; if (element_ctx.has_nested_in_flatten && is_top_array) { checkAmbiguousStructure(ctx, paths); diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h index 69d900ee96db56..ccf7e0ba08b55f 100644 --- a/be/src/vec/json/json_parser.h +++ b/be/src/vec/json/json_parser.h @@ -23,11 +23,15 @@ #include #include +#include +#include +#include #include #include #include #include +#include "gen_cpp/olap_file.pb.h" #include "runtime/primitive_type.h" #include "util/jsonb_writer.h" #include "vec/columns/column.h" @@ -38,6 +42,9 @@ #include "vec/json/simd_json_parser.h" namespace doris::vectorized { +namespace variant_util { +struct CompiledSkipMatcher; +} template Field getValueAsField(const Element& element) { @@ -99,6 +106,16 @@ void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { } } +#ifdef BE_TEST +struct SkipCacheStats { + uint64_t lookup_count = 0; + uint64_t hit_count = 0; + uint64_t miss_count = 0; + uint64_t insert_count = 0; + uint64_t evict_count = 0; +}; +#endif + struct ParseConfig { bool enable_flatten_nested = false; enum class ParseTo { @@ -107,6 +124,19 @@ struct ParseConfig { BothSubcolumnsAndDocValueColumn = 2, }; ParseTo parse_to = ParseTo::OnlySubcolumns; + // skip path patterns for variant column (pointer to avoid copy; nullptr means no skip) + const std::vector>* skip_path_patterns = nullptr; + // pre-compiled skip matcher for hot parsing path + std::shared_ptr compiled_skip_matcher = nullptr; + // max entries for "path -> skip result" cache, 0 means disabled + uint16_t skip_result_cache_capacity = 256; + // if true, first effective row learns cache size (capped at 16384) and rounds it up to a + // power of two, then 
reuses learned size. + bool adaptive_skip_result_cache_capacity = false; +#ifdef BE_TEST + // optional cache stats for tests/observability + SkipCacheStats* skip_cache_stats = nullptr; +#endif }; /// Result of parsing of a document. /// Contains all paths extracted from document @@ -124,6 +154,13 @@ class JSONDataParser { std::optional parse(const char* begin, size_t length, const ParseConfig& config); private: + using SkipCacheLru = std::list; + struct SkipCacheEntry { + bool is_skipped = false; + SkipCacheLru::iterator lru_it; + }; + using SkipCache = phmap::flat_hash_map; + struct ParseContext { PathInDataBuilder builder; std::vector paths; @@ -131,6 +168,20 @@ class JSONDataParser { bool enable_flatten_nested = false; bool has_nested_in_flatten = false; bool is_top_array = false; + // skip path patterns pointer (nullptr means no skip) + const std::vector>* skip_path_patterns = nullptr; + // pre-compiled skip matcher (nullptr means use skip_path_patterns fallback) + const variant_util::CompiledSkipMatcher* skip_matcher = nullptr; + // max entries for skip result cache + uint16_t skip_result_cache_capacity = 0; + bool skip_cache_unbounded = false; + SkipCache* skip_cache = nullptr; + SkipCacheLru* skip_cache_lru = nullptr; +#ifdef BE_TEST + SkipCacheStats* skip_cache_stats = nullptr; +#endif + // incrementally maintained dot-separated path for skip matching + std::string current_path; }; using PathPartsWithArray = std::pair; using PathToArray = phmap::flat_hash_map; @@ -164,8 +215,17 @@ class JSONDataParser { void traverseAsJsonb(const Element& element, JsonbWriter& writer); void traverseObjectAsJsonb(const JSONObject& object, JsonbWriter& writer); void traverseArrayAsJsonb(const JSONArray& array, JsonbWriter& writer); + void prepare_skip_cache(const ParseConfig& config, ParseContext& context); + void reset_skip_cache(); ParserImpl parser; + SkipCache skip_cache; + SkipCacheLru skip_cache_lru; + std::shared_ptr skip_cache_matcher_holder; + const std::vector>* 
skip_cache_patterns = nullptr; + uint16_t skip_cache_config_capacity = 0; + bool skip_cache_adaptive = false; + size_t skip_cache_learned_capacity = 0; }; } // namespace doris::vectorized diff --git a/be/test/olap/rowset/segment_v2/variant_util_test.cpp b/be/test/olap/rowset/segment_v2/variant_util_test.cpp index bb87ee0ebd7d78..cb5aa29b70da45 100644 --- a/be/test/olap/rowset/segment_v2/variant_util_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_util_test.cpp @@ -17,11 +17,16 @@ #include "testutil/variant_util.h" +#include +#include +#include +#include #include #include #include #include "gen_cpp/olap_file.pb.h" +#include "glog/logging.h" #include "gtest/gtest.h" #include "olap/tablet_schema.h" #include "vec/columns/column_string.h" @@ -42,6 +47,173 @@ static vectorized::ColumnString::MutablePtr _make_json_column( return col; } +static uint64_t _splitmix64(uint64_t x) { + x += 0x9e3779b97f4a7c15ULL; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; + return x ^ (x >> 31); +} + +static constexpr size_t kPerfNestedDim = 10; +static constexpr size_t kPerfNestedLeafCount = + kPerfNestedDim * kPerfNestedDim * kPerfNestedDim * kPerfNestedDim; + +static std::string _path_of_leaf_id(size_t leaf_id) { + const size_t g = leaf_id / (kPerfNestedDim * kPerfNestedDim * kPerfNestedDim); + const size_t s = (leaf_id / (kPerfNestedDim * kPerfNestedDim)) % kPerfNestedDim; + const size_t t = (leaf_id / kPerfNestedDim) % kPerfNestedDim; + const size_t k = leaf_id % kPerfNestedDim; + std::string path; + path.reserve(16); + path += "g"; + path.push_back(static_cast('0' + g)); + path += ".s"; + path.push_back(static_cast('0' + s)); + path += ".t"; + path.push_back(static_cast('0' + t)); + path += ".k"; + path.push_back(static_cast('0' + k)); + return path; +} + +static std::string _build_nested_json_row(size_t row_idx, uint64_t seed) { + std::string root; + root.reserve(220000); + root.push_back('{'); + bool first_g = true; + for (size_t 
g = 0; g < kPerfNestedDim; ++g) { + std::string g_obj; + g_obj.push_back('{'); + bool first_s = true; + for (size_t s = 0; s < kPerfNestedDim; ++s) { + std::string s_obj; + s_obj.push_back('{'); + bool first_t = true; + for (size_t t = 0; t < kPerfNestedDim; ++t) { + std::string t_obj; + t_obj.push_back('{'); + bool first_k = true; + for (size_t k = 0; k < kPerfNestedDim; ++k) { + const size_t leaf_id = + ((g * kPerfNestedDim + s) * kPerfNestedDim + t) * kPerfNestedDim + k; + // Keep many nested columns per row to stress skip-pattern matching. + if (!first_k) { + t_obj.push_back(','); + } + first_k = false; + const uint64_t value = + _splitmix64(seed ^ (static_cast(row_idx) << 32) ^ leaf_id) % + 1000003ULL; + t_obj += "\"k"; + t_obj.push_back(static_cast('0' + k)); + t_obj += "\":"; + t_obj += std::to_string(value); + } + if (!first_k) { + t_obj.push_back('}'); + if (!first_t) { + s_obj.push_back(','); + } + first_t = false; + s_obj += "\"t"; + s_obj.push_back(static_cast('0' + t)); + s_obj += "\":"; + s_obj += t_obj; + } + } + if (!first_t) { + s_obj.push_back('}'); + if (!first_s) { + g_obj.push_back(','); + } + first_s = false; + g_obj += "\"s"; + g_obj.push_back(static_cast('0' + s)); + g_obj += "\":"; + g_obj += s_obj; + } + } + if (!first_s) { + g_obj.push_back('}'); + if (!first_g) { + root.push_back(','); + } + first_g = false; + root += "\"g"; + root.push_back(static_cast('0' + g)); + root += "\":"; + root += g_obj; + } + } + root += ",\"meta\":{\"row_id\":"; + root += std::to_string(row_idx); + root += ",\"rand\":"; + root += std::to_string(_splitmix64(seed + row_idx) % 9973ULL); + root += "}}"; + return root; +} + +static std::vector _build_nested_json_rows(size_t rows, uint64_t seed) { + std::vector result; + result.reserve(rows); + for (size_t i = 0; i < rows; ++i) { + result.emplace_back(_build_nested_json_row(i, seed)); + } + return result; +} + +static vectorized::ColumnString::MutablePtr _make_json_column( + const std::vector& rows) { + auto col = 
vectorized::ColumnString::create(); + for (const auto& row : rows) { + col->insert_data(row.data(), row.size()); + } + return col; +} + +static std::vector> _build_skip_patterns_for_perf() { + std::vector> patterns; + patterns.reserve(96); + + // Exact match patterns. + for (size_t leaf_id = 0; leaf_id < kPerfNestedLeafCount; leaf_id += 211) { + patterns.emplace_back(_path_of_leaf_id(leaf_id), PatternTypePB::SKIP_NAME); + } + + // Unmatched glob patterns to amplify old per-pattern matching cost. + for (int i = 0; i < 30; ++i) { + patterns.emplace_back("x" + std::to_string(i) + "*.s?.t?.k?", + PatternTypePB::SKIP_NAME_GLOB); + } + + // Matched glob patterns. + for (size_t g = 0; g < kPerfNestedDim; ++g) { + std::string pattern = "g"; + pattern.push_back(static_cast('0' + g)); + pattern += ".s?.t?.k[02468]"; + patterns.emplace_back(std::move(pattern), PatternTypePB::SKIP_NAME_GLOB); + } + + return patterns; +} + +struct PerfParseResult { + vectorized::ColumnVariant::MutablePtr column; + int64_t elapsed_ms = 0; +}; + +static PerfParseResult _run_parse_perf(const vectorized::ColumnString& json_column, + const vectorized::ParseConfig& config) { + auto variant = vectorized::ColumnVariant::create(0); + const auto start = std::chrono::steady_clock::now(); + parse_json_to_variant(*variant, json_column, config); + const auto end = std::chrono::steady_clock::now(); + PerfParseResult result; + result.column = std::move(variant); + result.elapsed_ms = std::chrono::duration_cast(end - start).count(); + return result; +} + TEST(VariantUtilTest, ParseDocValueToSubcolumns_FillsDefaultsAndValues) { const std::vector jsons = { R"({"a":1,"b":"x"})", // @@ -341,4 +513,368 @@ TEST(VariantUtilTest, GlobMatchRe2) { EXPECT_FALSE(glob_match_re2("a[\\]b", "a]b")); } +TEST(VariantUtilTest, ShouldSkipPathLegacyPatterns) { + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + {"typed_*", PatternTypePB::MATCH_NAME_GLOB}, + }; + + 
EXPECT_TRUE(should_skip_path(skip_patterns, "secret")); + EXPECT_TRUE(should_skip_path(skip_patterns, "debug_field")); + EXPECT_FALSE(should_skip_path(skip_patterns, "typed_field")); + EXPECT_FALSE(should_skip_path(skip_patterns, "other")); +} + +TEST(VariantUtilTest, PatternTypeHelpers) { + EXPECT_TRUE(is_typed_path_pattern_type(PatternTypePB::MATCH_NAME)); + EXPECT_TRUE(is_typed_path_pattern_type(PatternTypePB::MATCH_NAME_GLOB)); + EXPECT_FALSE(is_typed_path_pattern_type(PatternTypePB::SKIP_NAME)); + EXPECT_FALSE(is_typed_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + + EXPECT_TRUE(is_skip_exact_path_pattern_type(PatternTypePB::SKIP_NAME)); + EXPECT_FALSE(is_skip_exact_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + EXPECT_TRUE(is_skip_glob_path_pattern_type(PatternTypePB::SKIP_NAME_GLOB)); + EXPECT_FALSE(is_skip_glob_path_pattern_type(PatternTypePB::MATCH_NAME_GLOB)); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherRejectsNullOutPointer) { + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + }; + Status st = build_compiled_skip_matcher(skip_patterns, true, nullptr); + EXPECT_FALSE(st.ok()); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherMixedPatterns) { + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + {"[invalid", PatternTypePB::SKIP_NAME_GLOB}, + {"typed_*", PatternTypePB::MATCH_NAME_GLOB}, + }; + + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, false, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_TRUE(matcher != nullptr); + + EXPECT_TRUE(should_skip_path(*matcher, "secret")); + EXPECT_TRUE(should_skip_path(*matcher, "debug_field")); + EXPECT_FALSE(should_skip_path(*matcher, "typed_field")); + EXPECT_FALSE(should_skip_path(*matcher, "other")); +} + +TEST(VariantUtilTest, BuildCompiledSkipMatcherWithRe2Set) { + std::vector> skip_patterns; + for (int i = 0; i < 40; ++i) { + 
skip_patterns.emplace_back("k" + std::to_string(i) + "_*", PatternTypePB::SKIP_NAME_GLOB); + } + + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_TRUE(matcher != nullptr); + + EXPECT_TRUE(should_skip_path(*matcher, "k1_abc")); + EXPECT_TRUE(should_skip_path(*matcher, "k39_abc")); + EXPECT_FALSE(should_skip_path(*matcher, "unknown_abc")); +} + +TEST(VariantUtilTest, ParseVariantColumnsApplySkipPatternsFromSchemaChildren) { + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + auto* c = schema_pb.add_column(); + c->set_unique_id(1); + c->set_name("v"); + c->set_type("VARIANT"); + c->set_is_key(false); + c->set_is_nullable(false); + c->set_variant_enable_doc_mode(false); + + // Typed path: should not be skipped. + auto* typed = c->add_children_columns(); + typed->set_unique_id(2); + typed->set_name("num_*"); + typed->set_type("BIGINT"); + typed->set_is_key(false); + typed->set_is_nullable(true); + typed->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB); + + // Skip exact. + auto* skip_exact = c->add_children_columns(); + skip_exact->set_unique_id(3); + skip_exact->set_name("secret"); + skip_exact->set_type("STRING"); + skip_exact->set_is_key(false); + skip_exact->set_is_nullable(true); + skip_exact->set_pattern_type(PatternTypePB::SKIP_NAME); + + // Skip glob. 
+ auto* skip_glob = c->add_children_columns(); + skip_glob->set_unique_id(4); + skip_glob->set_name("debug_*"); + skip_glob->set_type("STRING"); + skip_glob->set_is_key(false); + skip_glob->set_is_nullable(true); + skip_glob->set_pattern_type(PatternTypePB::SKIP_NAME_GLOB); + + TabletSchema tablet_schema; + tablet_schema.init_from_pb(schema_pb); + + auto variant = vectorized::ColumnVariant::create(0); + doris::VariantUtil::insert_root_scalar_field( + *variant, vectorized::Field::create_field( + String(R"({"secret":1,"debug_a":2,"keep":3,"num_a":4})"))); + doris::VariantUtil::insert_root_scalar_field( + *variant, vectorized::Field::create_field( + String(R"({"secret":5,"debug_b":6,"keep":7,"num_b":8})"))); + + vectorized::Block block; + block.insert({variant->get_ptr(), std::make_shared(0), "v"}); + + Status st = + parse_and_materialize_variant_columns(block, tablet_schema, std::vector {0}); + ASSERT_TRUE(st.ok()) << st.to_string(); + + const auto& col0 = *block.get_by_position(0).column; + const auto& out = assert_cast(col0); + + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("secret"))); + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("debug_a"))); + EXPECT_EQ(nullptr, out.get_subcolumn(vectorized::PathInData("debug_b"))); + + const auto* sub_keep = out.get_subcolumn(vectorized::PathInData("keep")); + const auto* sub_num_a = out.get_subcolumn(vectorized::PathInData("num_a")); + const auto* sub_num_b = out.get_subcolumn(vectorized::PathInData("num_b")); + ASSERT_TRUE(sub_keep != nullptr); + ASSERT_TRUE(sub_num_a != nullptr); + ASSERT_TRUE(sub_num_b != nullptr); +} + +TEST(VariantUtilTest, SkipPatternCacheHitsAcrossRows) { + constexpr size_t kRows = 64; + std::vector json_rows; + json_rows.reserve(kRows); + for (size_t i = 0; i < kRows; ++i) { + json_rows.emplace_back("{\"secret\":" + std::to_string(i) + + ",\"keep\":" + std::to_string(i + 1) + "}"); + } + + auto json_column = _make_json_column(json_rows); + + std::vector> skip_patterns = 
{ + {"secret", PatternTypePB::SKIP_NAME}, + }; + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + vectorized::ParseConfig cfg; + cfg.enable_flatten_nested = false; + cfg.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + cfg.skip_path_patterns = &skip_patterns; + cfg.compiled_skip_matcher = matcher; + cfg.skip_result_cache_capacity = 8; + cfg.adaptive_skip_result_cache_capacity = true; + vectorized::SkipCacheStats cache_stats; + cfg.skip_cache_stats = &cache_stats; + + auto out = vectorized::ColumnVariant::create(0); + parse_json_to_variant(*out, *json_column, cfg); + + const double hit_rate = cache_stats.lookup_count > 0 + ? static_cast(cache_stats.hit_count) / + static_cast(cache_stats.lookup_count) + : 0.0; + const double miss_rate = cache_stats.lookup_count > 0 + ? static_cast(cache_stats.miss_count) / + static_cast(cache_stats.lookup_count) + : 0.0; + LOG(INFO) << "skip cache cross-row stats: " + << "lookups=" << cache_stats.lookup_count << ", " + << "hits=" << cache_stats.hit_count << ", " + << "misses=" << cache_stats.miss_count << ", " + << "hit_rate=" << hit_rate << ", " + << "miss_rate=" << miss_rate << ", " + << "inserts=" << cache_stats.insert_count << ", " + << "evicts=" << cache_stats.evict_count; + + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("secret"))); + EXPECT_TRUE(out->get_subcolumn(vectorized::PathInData("keep")) != nullptr); + // Each row has unique object keys within the row, so cache hits here must be cross-row hits. 
+ EXPECT_EQ(cache_stats.lookup_count, kRows * 2); + EXPECT_EQ(cache_stats.hit_count, (kRows - 1) * 2); + EXPECT_EQ(cache_stats.miss_count, 2); + EXPECT_EQ(cache_stats.insert_count, 2); + EXPECT_EQ(cache_stats.evict_count, 0); +} + +TEST(VariantUtilTest, AdaptiveSkipPatternCacheRoundsUpCapacity) { + std::vector json_rows = { + R"({"a":1,"b":1,"c":1})", + R"({"a":2,"b":2,"c":2,"d":2})", + }; + auto json_column = _make_json_column(json_rows); + + std::vector> skip_patterns = { + {"a", PatternTypePB::SKIP_NAME}, + {"b", PatternTypePB::SKIP_NAME}, + {"c", PatternTypePB::SKIP_NAME}, + {"d", PatternTypePB::SKIP_NAME}, + }; + std::shared_ptr matcher; + Status st = build_compiled_skip_matcher(skip_patterns, true, &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + vectorized::ParseConfig cfg; + cfg.enable_flatten_nested = false; + cfg.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + cfg.skip_path_patterns = &skip_patterns; + cfg.compiled_skip_matcher = matcher; + cfg.skip_result_cache_capacity = 1; + cfg.adaptive_skip_result_cache_capacity = true; + vectorized::SkipCacheStats cache_stats; + cfg.skip_cache_stats = &cache_stats; + + auto out = vectorized::ColumnVariant::create(0); + parse_json_to_variant(*out, *json_column, cfg); + + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("a"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("b"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("c"))); + EXPECT_EQ(nullptr, out->get_subcolumn(vectorized::PathInData("d"))); + + // First row learns 3 skipped keys and rounds capacity up to 4, so inserting 'd' on second row + // should not evict any cached key. 
+ EXPECT_EQ(cache_stats.insert_count, 4); + EXPECT_EQ(cache_stats.evict_count, 0); +} + +TEST(VariantUtilTest, SkipPatternPerfCompareOptimizationMatrix) { + if (std::getenv("DORIS_RUN_VARIANT_SKIP_PERF_UT") == nullptr) { + GTEST_SKIP() << "Set DORIS_RUN_VARIANT_SKIP_PERF_UT=1 to run this heavy perf test."; + } + + constexpr size_t kRows = 1000; + constexpr uint64_t kSeed = 0x20260211ULL; + const auto json_rows = _build_nested_json_rows(kRows, kSeed); + const auto json_column = _make_json_column(json_rows); + const auto skip_patterns = _build_skip_patterns_for_perf(); + + vectorized::ParseConfig no_skip_config; + no_skip_config.enable_flatten_nested = false; + no_skip_config.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + + vectorized::ParseConfig legacy_config; + legacy_config.enable_flatten_nested = false; + legacy_config.parse_to = vectorized::ParseConfig::ParseTo::OnlySubcolumns; + legacy_config.skip_path_patterns = &skip_patterns; + legacy_config.compiled_skip_matcher = nullptr; + legacy_config.skip_result_cache_capacity = 0; + legacy_config.adaptive_skip_result_cache_capacity = false; + + std::shared_ptr compiled_matcher_with_re2_set; + Status st = build_compiled_skip_matcher(skip_patterns, true, &compiled_matcher_with_re2_set); + ASSERT_TRUE(st.ok()) << st.to_string(); + + std::shared_ptr compiled_matcher_without_re2_set; + st = build_compiled_skip_matcher(skip_patterns, false, &compiled_matcher_without_re2_set); + ASSERT_TRUE(st.ok()) << st.to_string(); + + // 3) current optimization - is_skipped cache and RE2::Set both disabled. + vectorized::ParseConfig optimized_no_cache_no_re2set_config = legacy_config; + optimized_no_cache_no_re2set_config.compiled_skip_matcher = compiled_matcher_without_re2_set; + optimized_no_cache_no_re2set_config.skip_result_cache_capacity = 0; + optimized_no_cache_no_re2set_config.adaptive_skip_result_cache_capacity = false; + + // 4) current optimization - is_skipped cache disabled. 
+ vectorized::ParseConfig optimized_no_cache_config = legacy_config; + optimized_no_cache_config.compiled_skip_matcher = compiled_matcher_with_re2_set; + optimized_no_cache_config.skip_result_cache_capacity = 0; + optimized_no_cache_config.adaptive_skip_result_cache_capacity = false; + + // 5) current optimization - RE2::Set disabled. + vectorized::ParseConfig optimized_no_re2set_config = legacy_config; + optimized_no_re2set_config.compiled_skip_matcher = compiled_matcher_without_re2_set; + optimized_no_re2set_config.skip_result_cache_capacity = 256; + optimized_no_re2set_config.adaptive_skip_result_cache_capacity = true; + + // 6) current optimization. + vectorized::ParseConfig optimized_config = legacy_config; + optimized_config.compiled_skip_matcher = compiled_matcher_with_re2_set; + optimized_config.skip_result_cache_capacity = 256; + optimized_config.adaptive_skip_result_cache_capacity = true; + + auto no_skip_result = _run_parse_perf(*json_column, no_skip_config); + auto legacy_result = _run_parse_perf(*json_column, legacy_config); + auto optimized_no_cache_no_re2set_result = + _run_parse_perf(*json_column, optimized_no_cache_no_re2set_config); + auto optimized_no_cache_result = _run_parse_perf(*json_column, optimized_no_cache_config); + auto optimized_no_re2set_result = _run_parse_perf(*json_column, optimized_no_re2set_config); + auto optimized_result = _run_parse_perf(*json_column, optimized_config); + + ASSERT_EQ(no_skip_result.column->size(), kRows); + ASSERT_EQ(legacy_result.column->size(), kRows); + ASSERT_EQ(optimized_no_cache_no_re2set_result.column->size(), kRows); + ASSERT_EQ(optimized_no_cache_result.column->size(), kRows); + ASSERT_EQ(optimized_no_re2set_result.column->size(), kRows); + ASSERT_EQ(optimized_result.column->size(), kRows); + + vectorized::DataTypeSerDe::FormatOptions options; + bool found_no_skip_difference = false; + for (size_t row = 0; row < kRows; row += 97) { + std::string no_skip_row; + std::string legacy_row; + std::string 
optimized_no_cache_no_re2set_row; + std::string optimized_no_cache_row; + std::string optimized_no_re2set_row; + std::string optimized_row; + no_skip_result.column->serialize_one_row_to_string(row, &no_skip_row, options); + legacy_result.column->serialize_one_row_to_string(row, &legacy_row, options); + optimized_no_cache_no_re2set_result.column->serialize_one_row_to_string( + row, &optimized_no_cache_no_re2set_row, options); + optimized_no_cache_result.column->serialize_one_row_to_string(row, &optimized_no_cache_row, + options); + optimized_no_re2set_result.column->serialize_one_row_to_string( + row, &optimized_no_re2set_row, options); + optimized_result.column->serialize_one_row_to_string(row, &optimized_row, options); + if (!found_no_skip_difference && no_skip_row != legacy_row) { + found_no_skip_difference = true; + } + ASSERT_EQ(legacy_row, optimized_no_cache_no_re2set_row) << "row=" << row; + ASSERT_EQ(legacy_row, optimized_no_cache_row) << "row=" << row; + ASSERT_EQ(legacy_row, optimized_no_re2set_row) << "row=" << row; + ASSERT_EQ(legacy_row, optimized_row) << "row=" << row; + } + ASSERT_TRUE(found_no_skip_difference) + << "no-skip output should differ from skip-enabled output on sampled rows"; + + const auto safe_speedup = [](int64_t faster, int64_t slower) -> double { + return slower > 0 ? 
static_cast(faster) / static_cast(slower) : 0.0; + }; + + LOG(INFO) << "skip-pattern perf matrix (" << kRows << " rows, " << kPerfNestedLeafCount + << " nested columns, same random data): " + << "no_skip_ms=" << no_skip_result.elapsed_ms << ", " + << "legacy_ms=" << legacy_result.elapsed_ms << ", " + << "opt_no_cache_no_re2set_ms=" << optimized_no_cache_no_re2set_result.elapsed_ms + << ", opt_no_cache_ms=" << optimized_no_cache_result.elapsed_ms + << ", opt_no_re2set_ms=" << optimized_no_re2set_result.elapsed_ms + << ", optimized_ms=" << optimized_result.elapsed_ms + << ", speedup_opt_no_cache_no_re2set_vs_legacy=" + << safe_speedup(legacy_result.elapsed_ms, + optimized_no_cache_no_re2set_result.elapsed_ms) + << ", speedup_opt_no_cache_vs_opt_no_cache_no_re2set=" + << safe_speedup(optimized_no_cache_no_re2set_result.elapsed_ms, + optimized_no_cache_result.elapsed_ms) + << ", speedup_optimized_vs_opt_no_re2set=" + << safe_speedup(optimized_no_re2set_result.elapsed_ms, optimized_result.elapsed_ms) + << ", speedup_optimized_vs_opt_no_cache=" + << safe_speedup(optimized_no_cache_result.elapsed_ms, optimized_result.elapsed_ms) + << ", speedup_optimized_vs_legacy=" + << safe_speedup(legacy_result.elapsed_ms, optimized_result.elapsed_ms) + << ", skip_patterns=" << skip_patterns.size(); +} + } // namespace doris::vectorized::variant_util diff --git a/be/test/olap/tablet_schema_test.cpp b/be/test/olap/tablet_schema_test.cpp index f5b53d494390ac..6ff0a24b22731f 100644 --- a/be/test/olap/tablet_schema_test.cpp +++ b/be/test/olap/tablet_schema_test.cpp @@ -102,6 +102,51 @@ TEST_F(TabletSchemaTest, test_tablet_column_init_from_thrift) { EXPECT_FALSE(tablet_column.variant_enable_typed_paths_to_sparse()); } +TEST_F(TabletSchemaTest, test_tablet_column_init_from_thrift_skip_pattern_type) { + auto check_pattern_type = [](TPatternType::type thrift_pattern_type, + PatternTypePB expected_pattern_type) { + TColumn tcolumn; + tcolumn.__set_column_name("thrift_column"); + 
tcolumn.__set_col_unique_id(1001); + TColumnType column_type; + column_type.__set_type(TPrimitiveType::STRING); + column_type.__set_len(255); + tcolumn.__set_column_type(column_type); + tcolumn.__set_is_key(false); + tcolumn.__set_is_allow_null(true); + tcolumn.__set_pattern_type(thrift_pattern_type); + + TabletColumn tablet_column; + tablet_column.init_from_thrift(tcolumn); + EXPECT_EQ(expected_pattern_type, tablet_column.pattern_type()); + }; + + check_pattern_type(TPatternType::SKIP_NAME, PatternTypePB::SKIP_NAME); + check_pattern_type(TPatternType::SKIP_NAME_GLOB, PatternTypePB::SKIP_NAME_GLOB); +} + +TEST_F(TabletSchemaTest, test_tablet_column_pattern_type_roundtrip_skip) { + ColumnPB column_pb; + column_pb.set_unique_id(2001); + column_pb.set_name("variant_skip_col"); + column_pb.set_type("STRING"); + column_pb.set_is_key(false); + column_pb.set_is_nullable(true); + column_pb.set_length(255); + column_pb.set_aggregation("NONE"); + column_pb.set_visible(true); + column_pb.set_pattern_type(PatternTypePB::SKIP_NAME_GLOB); + + TabletColumn tablet_column; + tablet_column.init_from_pb(column_pb); + EXPECT_EQ(PatternTypePB::SKIP_NAME_GLOB, tablet_column.pattern_type()); + + ColumnPB roundtrip_pb; + tablet_column.to_schema_pb(&roundtrip_pb); + EXPECT_TRUE(roundtrip_pb.has_pattern_type()); + EXPECT_EQ(PatternTypePB::SKIP_NAME_GLOB, roundtrip_pb.pattern_type()); +} + TEST_F(TabletSchemaTest, test_tablet_index_init_from_pb) { TabletIndexPB index_pb; index_pb.set_index_id(12345); diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp index 1696130508137f..9e61dba67e175d 100644 --- a/be/test/vec/common/schema_util_test.cpp +++ b/be/test/vec/common/schema_util_test.cpp @@ -902,9 +902,9 @@ TEST_F(SchemaUtilTest, TestUpdateLeastSchemaInternal) { PathInData single_path("test_variant.c"); subcolumns_types[single_path] = {std::make_shared()}; - std::map typed_columns; - Status st = - 
variant_util::update_least_schema_internal(subcolumns_types, schema, 1, typed_columns); + std::map typed_path_columns; + Status st = variant_util::update_least_schema_internal(subcolumns_types, schema, 1, + typed_path_columns); EXPECT_TRUE(st.ok()); // Check results diff --git a/be/test/vec/jsonb/json_parser_test.cpp b/be/test/vec/jsonb/json_parser_test.cpp index e4790f6786c16a..a4f353fe9097f6 100644 --- a/be/test/vec/jsonb/json_parser_test.cpp +++ b/be/test/vec/jsonb/json_parser_test.cpp @@ -19,14 +19,26 @@ #include +#include +#include #include #include "common/config.h" #include "vec/common/string_ref.h" +#include "vec/common/variant_util.h" using doris::vectorized::JSONDataParser; using doris::vectorized::SimdJSONParser; using doris::vectorized::ParseConfig; +using doris::PatternTypePB; + +static std::set collect_paths(const doris::vectorized::ParseResult& result) { + std::set paths; + for (const auto& path : result.paths) { + paths.insert(path.get_path()); + } + return paths; +} TEST(JsonParserTest, ParseSimpleTypes) { JSONDataParser parser; @@ -474,3 +486,85 @@ TEST(JsonParserTest, KeyLengthLimitByConfig) { EXPECT_EQ(result->values[0].get_type(), doris::PrimitiveType::TYPE_JSONB); } } + +TEST(JsonParserTest, ParseWithSkipPatternsLegacyAndCompiledMatcher) { + JSONDataParser parser; + + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + {"debug_*", PatternTypePB::SKIP_NAME_GLOB}, + }; + + std::string json = R"({"secret":1,"debug_x":2,"keep":3})"; + ParseConfig legacy_config; + legacy_config.skip_path_patterns = &skip_patterns; + auto legacy_result = parser.parse(json.c_str(), json.size(), legacy_config); + ASSERT_TRUE(legacy_result.has_value()); + std::set legacy_paths = collect_paths(legacy_result.value()); + EXPECT_EQ(legacy_paths.find("secret"), legacy_paths.end()); + EXPECT_EQ(legacy_paths.find("debug_x"), legacy_paths.end()); + EXPECT_NE(legacy_paths.find("keep"), legacy_paths.end()); + + std::shared_ptr matcher; + auto st = 
doris::vectorized::variant_util::build_compiled_skip_matcher(skip_patterns, true, + &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + ParseConfig compiled_config; + compiled_config.skip_path_patterns = &skip_patterns; + compiled_config.compiled_skip_matcher = matcher; + compiled_config.skip_result_cache_capacity = 8; + auto compiled_result = parser.parse(json.c_str(), json.size(), compiled_config); + ASSERT_TRUE(compiled_result.has_value()); + std::set compiled_paths = collect_paths(compiled_result.value()); + EXPECT_EQ(legacy_paths, compiled_paths); +} + +TEST(JsonParserTest, ParseWithInvalidSkipGlobDoesNotDropPaths) { + JSONDataParser parser; + std::vector> skip_patterns = { + {"[invalid", PatternTypePB::SKIP_NAME_GLOB}, + }; + std::string json = R"({"invalid":1,"keep":2})"; + + ParseConfig config; + config.skip_path_patterns = &skip_patterns; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + std::set paths = collect_paths(result.value()); + EXPECT_NE(paths.find("invalid"), paths.end()); + EXPECT_NE(paths.find("keep"), paths.end()); + + std::shared_ptr matcher; + auto st = doris::vectorized::variant_util::build_compiled_skip_matcher(skip_patterns, true, + &matcher); + ASSERT_TRUE(st.ok()) << st.to_string(); + + ParseConfig compiled_config; + compiled_config.skip_path_patterns = &skip_patterns; + compiled_config.compiled_skip_matcher = matcher; + auto compiled_result = parser.parse(json.c_str(), json.size(), compiled_config); + ASSERT_TRUE(compiled_result.has_value()); + std::set compiled_paths = collect_paths(compiled_result.value()); + EXPECT_NE(compiled_paths.find("invalid"), compiled_paths.end()); + EXPECT_NE(compiled_paths.find("keep"), compiled_paths.end()); +} + +TEST(JsonParserTest, SkipRulesDoNotApplyInsideArrayElements) { + JSONDataParser parser; + std::vector> skip_patterns = { + {"secret", PatternTypePB::SKIP_NAME}, + }; + std::string json = R"([{"secret":1,"keep":2},{"secret":3,"keep":4}])"; + 
+ ParseConfig config; + config.enable_flatten_nested = true; + config.skip_path_patterns = &skip_patterns; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + + std::set paths = collect_paths(result.value()); + // Skip is disabled in traverseArrayElement; element object paths should remain. + EXPECT_NE(paths.find("secret"), paths.end()); + EXPECT_NE(paths.find("keep"), paths.end()); +} diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java index 3cd318aa6c5fe6..345f48cbcc70e7 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/Type.java @@ -1158,8 +1158,8 @@ public static boolean matchExactType(Type type1, Type type2, boolean ignorePreci } return true; } else if (type1.isVariantType()) { - ArrayList fields1 = ((VariantType) type1).getPredefinedFields(); - ArrayList fields2 = ((VariantType) type2).getPredefinedFields(); + ArrayList fields1 = ((VariantType) type1).getVariantTypedPathPatterns(); + ArrayList fields2 = ((VariantType) type2).getVariantTypedPathPatterns(); if (fields1.size() != fields2.size()) { return false; } diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java index adb99c52cf009e..c5055026e17361 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantField.java @@ -66,15 +66,34 @@ public TPatternType getPatternType() { return patternType; } + public boolean isSkipPatternType() { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public boolean isTypedPathPatternType() { + return patternType == null + || patternType == TPatternType.MATCH_NAME + || patternType == TPatternType.MATCH_NAME_GLOB; + } + 
public String toSql(int depth) { StringBuilder sb = new StringBuilder(); + if (isSkipPatternType()) { + sb.append("SKIP "); + if (patternType == TPatternType.SKIP_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } + if (patternType == TPatternType.MATCH_NAME) { sb.append(patternType.toString()).append(" "); } sb.append("'").append(pattern).append("'"); sb.append(":").append(type.toSql(depth + 1)); - if (!comment.isEmpty()) { + if (comment != null && !comment.isEmpty()) { sb.append(" COMMENT '").append(comment).append("'"); } return sb.toString(); @@ -98,6 +117,9 @@ public String prettyPrint(int lpad) { } public boolean matchesField(VariantField f) { + if (!isTypedPathPatternType() || !f.isTypedPathPatternType()) { + return false; + } if (equals(f)) { return true; } @@ -114,7 +136,9 @@ public boolean equals(Object other) { return false; } VariantField otherFiled = (VariantField) other; - return otherFiled.pattern.equals(pattern) && otherFiled.type.equals(type); + return otherFiled.pattern.equals(pattern) + && otherFiled.type.equals(type) + && otherFiled.patternType == patternType; } @Override diff --git a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java index da062d21489f66..7c1fca95f822d7 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java +++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java @@ -38,7 +38,7 @@ public class VariantType extends ScalarType { private final HashMap fieldMap = Maps.newHashMap(); @SerializedName(value = "fields") - private final ArrayList predefinedFields; + private final ArrayList variantPathPatterns; @SerializedName(value = "variantMaxSubcolumnsCount") private final int variantMaxSubcolumnsCount; @@ -65,7 +65,7 @@ public class VariantType extends ScalarType { public VariantType() { super(PrimitiveType.VARIANT); - this.predefinedFields 
= Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -78,10 +78,8 @@ public VariantType() { public VariantType(ArrayList fields) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + this.variantPathPatterns = fields; + addTypedPathPatternsToFieldMap(); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -93,7 +91,7 @@ public VariantType(ArrayList fields) { public VariantType(Map properties) { super(PrimitiveType.VARIANT); - this.predefinedFields = Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.properties = properties; this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; @@ -107,10 +105,8 @@ public VariantType(Map properties) { public VariantType(ArrayList fields, Map properties) { super(PrimitiveType.VARIANT); Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + this.variantPathPatterns = fields; + addTypedPathPatternsToFieldMap(); this.properties = properties; this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; @@ -121,7 +117,8 @@ public VariantType(ArrayList fields, Map propertie this.variantDocShardCount = 64; } - public VariantType(ArrayList fields, int variantMaxSubcolumnsCount, + public VariantType(ArrayList variantPathPatterns, + int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, int variantSparseHashShardCount, @@ -129,11 +126,9 @@ public VariantType(ArrayList fields, int 
variantMaxSubcolumnsCount long variantDocMaterializationMinRows, int variantDocShardCount) { super(PrimitiveType.VARIANT); - Preconditions.checkNotNull(fields); - this.predefinedFields = fields; - for (VariantField predefinedField : this.predefinedFields) { - fieldMap.put(predefinedField.getPattern(), predefinedField); - } + Preconditions.checkNotNull(variantPathPatterns); + this.variantPathPatterns = variantPathPatterns; + addTypedPathPatternsToFieldMap(); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; @@ -143,13 +138,21 @@ public VariantType(ArrayList fields, int variantMaxSubcolumnsCount this.variantDocShardCount = variantDocShardCount; } + private void addTypedPathPatternsToFieldMap() { + for (VariantField pathPattern : variantPathPatterns) { + if (pathPattern.isTypedPathPatternType()) { + fieldMap.put(pathPattern.getPattern(), pathPattern); + } + } + } + @Override public String toSql(int depth) { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); - if (!predefinedFields.isEmpty()) { - sb.append(predefinedFields.stream() + if (!variantPathPatterns.isEmpty()) { + sb.append(variantPathPatterns.stream() .map(variantField -> variantField.toSql(depth)).collect(Collectors.joining(","))); sb.append(","); } @@ -181,8 +184,18 @@ public String toSql(int depth) { return sb.toString(); } - public ArrayList getPredefinedFields() { - return predefinedFields; + public ArrayList getVariantPathPatterns() { + return variantPathPatterns; + } + + public ArrayList getVariantTypedPathPatterns() { + ArrayList typedPathPatterns = Lists.newArrayList(); + for (VariantField variantPathPattern : variantPathPatterns) { + if (variantPathPattern.isTypedPathPatternType()) { + typedPathPatterns.add(variantPathPattern); + } + } + return typedPathPatterns; } @Override @@ -214,7 +227,7 @@ public boolean equals(Object 
other) { return false; } VariantType otherVariantType = (VariantType) other; - return Objects.equals(otherVariantType.getPredefinedFields(), predefinedFields) + return Objects.equals(otherVariantType.getVariantPathPatterns(), variantPathPatterns) && variantMaxSubcolumnsCount == otherVariantType.variantMaxSubcolumnsCount && enableTypedPathsToSparse == otherVariantType.enableTypedPathsToSparse && enableVariantDocMode == otherVariantType.enableVariantDocMode diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 12a73beb2c5559..33106fa3406049 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -517,6 +517,7 @@ SHAPE: 'SHAPE'; SHOW: 'SHOW'; SIGNED: 'SIGNED'; SKEW: 'SKEW'; +SKIP_: 'SKIP'; SMALLINT: 'SMALLINT'; SNAPSHOT: 'SNAPSHOT'; SNAPSHOTS: 'SNAPSHOTS'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index be2c75019006a4..076fec93af498d 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -1874,6 +1874,7 @@ variantSubColTypeList ; variantSubColType : variantSubColMatchType? STRING_LITERAL COLON dataType commentSpec? + | SKIP_ variantSubColMatchType? 
STRING_LITERAL ; variantSubColMatchType : (MATCH_NAME | MATCH_NAME_GLOB) @@ -2247,6 +2248,7 @@ nonReserved | SESSION_USER | SHAPE | SKEW + | SKIP_ | SNAPSHOT | SNAPSHOTS | SONAME diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java index 457259856cfb9f..0cf3e63edecda4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java @@ -108,6 +108,10 @@ public class Column implements GsonPostProcessable { @SerializedName(value = "comment") private String comment; @SerializedName(value = "children") + // Generic sub-columns for complex types. + // For VARIANT, this list stores both typed-path templates and skip rules. + // Caller should filter by fieldPatternType: + // MATCH_* -> typed path, SKIP_* -> skip pattern. private List children; /** * This is similar as `defaultValue`. Differences are: @@ -355,9 +359,10 @@ public void createChildrenColumn(Type type, Column column) { column.addChildrenColumn(c); } } else if (type.isVariantType() && type instanceof VariantType) { - // variant may contain predefined structured fields - ArrayList fields = ((VariantType) type).getPredefinedFields(); - for (VariantField field : fields) { + // Variant stores typed-path templates and skip patterns as sibling children, + // distinguished by fieldPatternType. 
+ ArrayList variantPathPatterns = ((VariantType) type).getVariantPathPatterns(); + for (VariantField field : variantPathPatterns) { // set column name as pattern Column c = new Column(field.pattern, field.getType()); c.setIsAllowNull(true); @@ -371,6 +376,22 @@ public List getChildren() { return children; } + public List getVariantTypedPathChildrenOrEmpty() { + if (!(type instanceof VariantType)) { + return Lists.newArrayList(); + } + if (CollectionUtils.isEmpty(children)) { + return Lists.newArrayList(); + } + List typedPathChildren = Lists.newArrayListWithCapacity(children.size()); + for (Column child : children) { + if (isVariantTypedPathPatternType(child.fieldPatternType)) { + typedPathChildren.add(child); + } + } + return typedPathChildren; + } + private void addChildrenColumn(Column column) { if (this.children == null) { this.children = Lists.newArrayListWithExpectedSize(2); @@ -697,23 +718,86 @@ private void setChildrenTColumn(Column children, TColumn tColumn) { toChildrenThrift(children, childrenTColumn); } - private void addChildren(Column column, TColumn tColumn) { - if (column.getChildren() != null) { - List childrenColumns = column.getChildren(); - tColumn.setChildrenColumn(new ArrayList<>()); - for (Column c : childrenColumns) { - setChildrenTColumn(c, tColumn); - } + private void appendVariantTypedPathChildren(Column column, TColumn tColumn) { + List typedPathChildren = column.getVariantTypedPathChildrenOrEmpty(); + if (typedPathChildren.isEmpty()) { + return; + } + ensureChildrenColumnInitialized(tColumn); + for (Column typedPathChild : typedPathChildren) { + setChildrenTColumn(typedPathChild, tColumn); } } - private void addChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { - if (this.getChildren() != null) { - List childrenColumns = this.getChildren(); - for (Column c : childrenColumns) { - builder.addChildrenColumns(c.toPb(Sets.newHashSet(), Lists.newArrayList())); + private void 
appendVariantTypedPathChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { + List typedPathChildren = getVariantTypedPathChildrenOrEmpty(); + if (typedPathChildren.isEmpty()) { + return; + } + for (Column typedPathChild : typedPathChildren) { + builder.addChildrenColumns(typedPathChild.toPb(Sets.newHashSet(), Lists.newArrayList())); + } + } + + private static PatternTypePB toPatternTypeForColumnPb(TPatternType patternType) { + if (patternType == null) { + return PatternTypePB.MATCH_NAME_GLOB; + } + PatternTypePB patternTypePb = PatternTypePB.forNumber(patternType.getValue()); + if (patternTypePb == null) { + throw new IllegalArgumentException("Unknown pattern type: " + patternType); + } + return patternTypePb; + } + + private static boolean isVariantTypedPathPatternType(TPatternType patternType) { + return patternType == null + || patternType == TPatternType.MATCH_NAME + || patternType == TPatternType.MATCH_NAME_GLOB; + } + + private static boolean isVariantSkipPatternType(TPatternType patternType) { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public List getVariantSkipPatternChildrenOrEmpty() { + if (!(type instanceof VariantType) || CollectionUtils.isEmpty(children)) { + return Lists.newArrayList(); + } + List skipPatternChildren = Lists.newArrayList(); + for (Column child : children) { + if (isVariantSkipPatternType(child.fieldPatternType)) { + skipPatternChildren.add(child); } } + return skipPatternChildren; + } + + private void appendVariantSkipPatternChildren(Column column, TColumn tColumn) { + List skipPatternChildren = column.getVariantSkipPatternChildrenOrEmpty(); + if (skipPatternChildren.isEmpty()) { + return; + } + ensureChildrenColumnInitialized(tColumn); + for (Column skipPatternChild : skipPatternChildren) { + setChildrenTColumn(skipPatternChild, tColumn); + } + } + + private void appendVariantSkipPatternChildren(OlapFile.ColumnPB.Builder builder) throws DdlException { + List 
skipPatternChildren = getVariantSkipPatternChildrenOrEmpty(); + if (skipPatternChildren.isEmpty()) { + return; + } + for (Column skipPatternChild : skipPatternChildren) { + builder.addChildrenColumns(skipPatternChild.toPb(Sets.newHashSet(), Lists.newArrayList())); + } + } + + private static void ensureChildrenColumnInitialized(TColumn tColumn) { + if (tColumn.children_column == null) { + tColumn.setChildrenColumn(new ArrayList<>()); + } } private void toChildrenThrift(Column column, TColumn tColumn) { @@ -734,8 +818,10 @@ private void toChildrenThrift(Column column, TColumn tColumn) { setChildrenTColumn(children, tColumn); } } else if (column.type.isVariantType()) { - // variant may contain predefined structured fields - addChildren(column, tColumn); + // Variant children are persisted as two peer groups: + // 1) typed path schema templates, 2) skip pattern rules. + appendVariantTypedPathChildren(column, tColumn); + appendVariantSkipPatternChildren(column, tColumn); } } @@ -819,11 +905,7 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws builder.setType(this.getDataType().toThrift().name()); builder.setIsKey(this.isKey); if (fieldPatternType != null) { - if (fieldPatternType == TPatternType.MATCH_NAME) { - builder.setPatternType(PatternTypePB.MATCH_NAME); - } else { - builder.setPatternType(PatternTypePB.MATCH_NAME_GLOB); - } + builder.setPatternType(toPatternTypeForColumnPb(fieldPatternType)); } if (null != this.aggregationType) { if (type.isAggStateType()) { @@ -886,8 +968,9 @@ public OlapFile.ColumnPB toPb(Set bfColumns, List indexes) throws builder.setVariantEnableDocMode(this.getVariantEnableDocMode()); builder.setVariantDocMaterializationMinRows(this.getvariantDocMaterializationMinRows()); builder.setVariantDocHashShardCount(this.getVariantDocShardCount()); - // variant may contain predefined structured fields - addChildren(builder); + // Keep typed paths and skip rules as sibling children entries. 
+ appendVariantTypedPathChildren(builder); + appendVariantSkipPatternChildren(builder); } OlapFile.ColumnPB col = builder.build(); @@ -977,9 +1060,14 @@ public void checkSchemaChangeAllowed(Column other) throws DdlException { if (this.getVariantDocShardCount() != other.getVariantDocShardCount()) { throw new DdlException("Can not change variant doc snapshot shard count"); } - if (CollectionUtils.isNotEmpty(this.getChildren()) || CollectionUtils.isNotEmpty(other.getChildren())) { + if (CollectionUtils.isNotEmpty(this.getVariantTypedPathChildrenOrEmpty()) + || CollectionUtils.isNotEmpty(other.getVariantTypedPathChildrenOrEmpty())) { throw new DdlException("Can not change variant schema templates"); } + if (CollectionUtils.isNotEmpty(this.getVariantSkipPatternChildrenOrEmpty()) + || CollectionUtils.isNotEmpty(other.getVariantSkipPatternChildrenOrEmpty())) { + throw new DdlException("Can not change variant skip patterns"); + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 07c550a639721d..e1c9b96ed13a28 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -3743,25 +3743,23 @@ public Index getInvertedIndex(Column column, List subPath, String analyz : filteredInvertedIndexes.stream().filter(Index::isAnalyzedInvertedIndex).findFirst().orElse(null); } - // subPath is not empty, means it is a variant column, find the field pattern from children + // subPath is not empty, means it is a variant column, find the field pattern from typed-path templates String subPathString = String.join(".", subPath); String fieldPattern = ""; - if (column.getChildren() != null) { - for (Column child : column.getChildren()) { - String childName = child.getName(); - if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) { - try { - com.google.re2j.Pattern compiled = 
GlobRegexUtil.getOrCompilePattern(childName); - if (compiled.matcher(subPathString).matches()) { - fieldPattern = childName; - } - } catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) { - continue; - } - } else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) { - if (childName.equals(subPathString)) { + for (Column child : column.getVariantTypedPathChildrenOrEmpty()) { + String childName = child.getName(); + if (child.getFieldPatternType() == TPatternType.MATCH_NAME_GLOB) { + try { + com.google.re2j.Pattern compiled = GlobRegexUtil.getOrCompilePattern(childName); + if (compiled.matcher(subPathString).matches()) { fieldPattern = childName; } + } catch (com.google.re2j.PatternSyntaxException | IllegalArgumentException e) { + continue; + } + } else if (child.getFieldPatternType() == TPatternType.MATCH_NAME) { + if (childName.equals(subPathString)) { + fieldPattern = childName; } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 82314c719a0dbc..5982889874e6c9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -477,7 +477,6 @@ import org.apache.doris.nereids.DorisParser.VariantContext; import org.apache.doris.nereids.DorisParser.VariantPredefinedFieldsContext; import org.apache.doris.nereids.DorisParser.VariantSubColTypeContext; -import org.apache.doris.nereids.DorisParser.VariantSubColTypeListContext; import org.apache.doris.nereids.DorisParser.VariantTypeDefinitionsContext; import org.apache.doris.nereids.DorisParser.WhereClauseContext; import org.apache.doris.nereids.DorisParser.WindowFrameContext; @@ -1061,6 +1060,7 @@ import org.apache.doris.nereids.types.DateV2Type; import org.apache.doris.nereids.types.LargeIntType; import org.apache.doris.nereids.types.MapType; 
+import org.apache.doris.nereids.types.StringType; import org.apache.doris.nereids.types.StructField; import org.apache.doris.nereids.types.StructType; import org.apache.doris.nereids.types.VarcharType; @@ -5100,8 +5100,23 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) "Unsupported variant definition: " + variantDef.getText()); VariantContext variantCtx = (VariantContext) variantDef; - List fields = variantCtx.variantSubColTypeList() != null - ? visitVariantSubColTypeList(variantCtx.variantSubColTypeList()) : Lists.newArrayList(); + List variantPathPatterns = Lists.newArrayList(); + if (variantCtx.variantSubColTypeList() != null) { + for (VariantSubColTypeContext subCtx : variantCtx.variantSubColTypeList().variantSubColType()) { + if (subCtx.SKIP_() != null) { + String skipPattern = subCtx.STRING_LITERAL().getText(); + skipPattern = skipPattern.substring(1, skipPattern.length() - 1); + String skipMatchType = subCtx.variantSubColMatchType() == null + ? null + : subCtx.variantSubColMatchType().getText(); + String skipPatternType = "MATCH_NAME".equalsIgnoreCase(skipMatchType) + ? "SKIP_NAME" : "SKIP_NAME_GLOB"; + variantPathPatterns.add(new VariantField(skipPattern, StringType.INSTANCE, "", skipPatternType)); + } else { + variantPathPatterns.add(visitVariantSubColType(subCtx)); + } + } + } Map properties = variantCtx.properties != null ? 
Maps.newHashMap(visitPropertyClause(variantCtx.properties)) : Maps.newHashMap(); @@ -5153,7 +5168,10 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) variantSparseHashShardCount = 0; // Validate that all typed fields use data types supported in doc mode // document mode only supports string, integral, float, and boolean types - for (VariantField field : fields) { + for (VariantField field : variantPathPatterns) { + if (field.isSkipPatternType()) { + continue; + } DataType dataType = field.getDataType(); if (dataType.isArrayType()) { ArrayType arrayType = (ArrayType) dataType; @@ -5184,7 +5202,7 @@ public DataType visitVariantPredefinedFields(VariantPredefinedFieldsContext ctx) + " and " + PropertyAnalyzer.PROPERTIES_VARIANT_DOC_HASH_SHARD_COUNT); } - return new VariantType(fields, variantMaxSubcolumnsCount, enableTypedPathsToSparse, + return new VariantType(variantPathPatterns, variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocHashShardCount); } @@ -5193,12 +5211,6 @@ private static boolean isSupportedVariantDocModeType(DataType type) { return type.isStringLikeType() || type.isIntegralType() || type.isFloatLikeType() || type.isBooleanType(); } - @Override - public List visitVariantSubColTypeList(VariantSubColTypeListContext ctx) { - return ctx.variantSubColType().stream().map( - this::visitVariantSubColType).collect(ImmutableList.toImmutableList()); - } - @Override public VariantField visitVariantSubColType(VariantSubColTypeContext ctx) { String comment; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java index dd715c3c54d065..26563c33d9466f 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/ElementAt.java @@ -104,7 +104,7 @@ public FunctionSignature computeSignature(FunctionSignature signature) { DataType expressionType = arguments.get(0).getDataType(); DataType sigType = signature.argumentsTypes.get(0); if (expressionType instanceof VariantType && sigType instanceof VariantType) { - // Preserve predefinedFields for schema template matching + // Preserve variant typed path patterns for schema template matching. VariantType originalType = (VariantType) expressionType; signature = signature.withArgumentType(0, originalType); signature = signature.withReturnType(originalType); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java index 0881884105f4a8..fff54f8244c0e0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateTableInfo.java @@ -1407,8 +1407,8 @@ private void columnToIndexesCheck() { } boolean findFieldPattern = false; VariantType variantType = (VariantType) column.getType(); - List predefinedFields = variantType.getPredefinedFields(); - for (VariantField field : predefinedFields) { + List typedPathPatterns = variantType.getVariantTypedPathPatterns(); + for (VariantField field : typedPathPatterns) { if (field.getPattern().equals(fieldPattern)) { findFieldPattern = true; if (!IndexDefinition.isSupportIdxType(field.getDataType())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index 911dc2e4e2cd51..16c1374c0b7b4b 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -480,12 +480,12 @@ public static DataType fromCatalogType(Type type) { // In the past, variant metadata used the ScalarType type. // Now, we use VariantType, which inherits from ScalarType, as the new metadata storage. if (type instanceof org.apache.doris.catalog.VariantType) { - List variantFields = ((org.apache.doris.catalog.VariantType) type) - .getPredefinedFields().stream() + List variantPathPatterns = ((org.apache.doris.catalog.VariantType) type) + .getVariantPathPatterns().stream() .map(cf -> new VariantField(cf.getPattern(), fromCatalogType(cf.getType()), cf.getComment() == null ? "" : cf.getComment(), cf.getPatternType().toString())) .collect(ImmutableList.toImmutableList()); - return new VariantType(variantFields, + return new VariantType(variantPathPatterns, ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount(), ((org.apache.doris.catalog.VariantType) type).getEnableTypedPathsToSparse(), ((org.apache.doris.catalog.VariantType) type).getVariantMaxSparseColumnStatisticsSize(), @@ -1113,10 +1113,10 @@ private static void validateScalarType(ScalarType scalarType) { break; } case VARIANT: { - ArrayList predefinedFields = - ((org.apache.doris.catalog.VariantType) scalarType).getPredefinedFields(); + ArrayList typedPathPatterns = + ((org.apache.doris.catalog.VariantType) scalarType).getVariantTypedPathPatterns(); Set fieldPatterns = new HashSet<>(); - for (org.apache.doris.catalog.VariantField field : predefinedFields) { + for (org.apache.doris.catalog.VariantField field : typedPathPatterns) { Type fieldType = field.getType(); validateNestedType(scalarType, fieldType); if (!fieldPatterns.add(field.getPattern())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java index 
a8e3bd9ded136b..8bdf96cf1a13f0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantField.java @@ -53,6 +53,10 @@ public VariantField(String pattern, DataType dataType, String comment, String pa TPatternType type; if (TPatternType.MATCH_NAME.name().equalsIgnoreCase(patternType)) { type = TPatternType.MATCH_NAME; + } else if (TPatternType.SKIP_NAME.name().equalsIgnoreCase(patternType)) { + type = TPatternType.SKIP_NAME; + } else if (TPatternType.SKIP_NAME_GLOB.name().equalsIgnoreCase(patternType)) { + type = TPatternType.SKIP_NAME_GLOB; } else { type = TPatternType.MATCH_NAME_GLOB; } @@ -71,6 +75,14 @@ public String getComment() { return comment; } + public boolean isSkipPatternType() { + return patternType == TPatternType.SKIP_NAME || patternType == TPatternType.SKIP_NAME_GLOB; + } + + public boolean isTypedPathPatternType() { + return patternType == TPatternType.MATCH_NAME || patternType == TPatternType.MATCH_NAME_GLOB; + } + /** * Check if the given field name matches this field's pattern. * This method uses a restricted glob syntax converted to regex. 
@@ -86,6 +98,9 @@ public String getComment() { * @return true if the field name matches the pattern */ public boolean matches(String fieldName) { + if (!isTypedPathPatternType()) { + return false; + } if (patternType == TPatternType.MATCH_NAME) { return pattern.equals(fieldName); } @@ -111,6 +126,14 @@ public org.apache.doris.catalog.VariantField toCatalogDataType() { */ public String toSql() { StringBuilder sb = new StringBuilder(); + if (isSkipPatternType()) { + sb.append("SKIP "); + if (patternType == TPatternType.SKIP_NAME) { + sb.append("MATCH_NAME "); + } + sb.append("'").append(pattern).append("'"); + return sb.toString(); + } if (patternType == TPatternType.MATCH_NAME) { sb.append(patternType.toString()).append(" "); } @@ -137,12 +160,12 @@ public boolean equals(Object o) { } VariantField that = (VariantField) o; return Objects.equals(pattern, that.pattern) && Objects.equals(dataType, - that.dataType); + that.dataType) && patternType == that.patternType; } @Override public int hashCode() { - return Objects.hash(pattern, dataType); + return Objects.hash(pattern, dataType, patternType); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java index af25e1f9061f2f..0fe92b8c188376 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java @@ -49,7 +49,7 @@ public class VariantType extends PrimitiveType { private final int variantMaxSparseColumnStatisticsSize; - private final List predefinedFields; + private final List variantPathPatterns; private final int variantSparseHashShardCount; private final boolean enableVariantDocMode; @@ -57,13 +57,13 @@ public class VariantType extends PrimitiveType { private final int variantDocShardCount; /** - * Creates a Variant type without predefined fields and only configures the max subcolumn limit. 
+ * Creates a Variant type without variant path patterns and only configures the max subcolumn limit. * * @param variantMaxSubcolumnsCount max number of subcolumns allowed (0 means unlimited) */ public VariantType(int variantMaxSubcolumnsCount) { this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; - this.predefinedFields = Lists.newArrayList(); + this.variantPathPatterns = Lists.newArrayList(); this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; this.variantSparseHashShardCount = 0; @@ -73,10 +73,10 @@ public VariantType(int variantMaxSubcolumnsCount) { } /** - * Contains predefined fields like struct + * Variant path patterns, including typed paths and skip rules. */ public VariantType(List fields) { - this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); + this.variantPathPatterns = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); this.variantMaxSubcolumnsCount = 0; this.enableTypedPathsToSparse = false; this.variantMaxSparseColumnStatisticsSize = 10000; @@ -87,9 +87,9 @@ public VariantType(List fields) { } /** - * Creates a Variant type with predefined fields and advanced optional properties. + * Creates a Variant type with variant path patterns and advanced optional properties. 
* - * @param fields predefined variant path fields + * @param variantPathPatterns variant path patterns, including typed paths and skip rules * @param variantMaxSubcolumnsCount max number of subcolumns allowed * @param enableTypedPathsToSparse whether typed paths should be materialized as sparse columns * @param variantMaxSparseColumnStatisticsSize upper bound of sparse path statistics entries @@ -97,11 +97,13 @@ public VariantType(List fields) { * @param enableVariantDocMode whether to enable variant doc snapshot writing mode * @param variantDocMaterializationMinRows minimum rows to generate doc snapshot columns */ - public VariantType(List fields, int variantMaxSubcolumnsCount, + public VariantType(List variantPathPatterns, + int variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse, int variantMaxSparseColumnStatisticsSize, int variantSparseHashShardCount, boolean enableVariantDocMode, long variantDocMaterializationMinRows, int variantDocShardCount) { - this.predefinedFields = ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be null")); + this.variantPathPatterns = ImmutableList.copyOf( + Objects.requireNonNull(variantPathPatterns, "variantPathPatterns should not be null")); this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount; this.enableTypedPathsToSparse = enableTypedPathsToSparse; this.variantMaxSparseColumnStatisticsSize = variantMaxSparseColumnStatisticsSize; @@ -113,8 +115,9 @@ public VariantType(List fields, int variantMaxSubcolumnsCount, @Override public DataType conversion() { - return new VariantType(predefinedFields.stream().map(VariantField::conversion) - .collect(Collectors.toList()), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + return new VariantType(variantPathPatterns.stream().map(VariantField::conversion) + .collect(Collectors.toList()), + variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, 
variantDocMaterializationMinRows, variantDocShardCount); @@ -122,9 +125,11 @@ public DataType conversion() { @Override public Type toCatalogDataType() { - org.apache.doris.catalog.VariantType type = new org.apache.doris.catalog.VariantType(predefinedFields.stream() + org.apache.doris.catalog.VariantType type = + new org.apache.doris.catalog.VariantType(variantPathPatterns.stream() .map(VariantField::toCatalogDataType) - .collect(Collectors.toCollection(ArrayList::new)), variantMaxSubcolumnsCount, enableTypedPathsToSparse, + .collect(Collectors.toCollection(ArrayList::new)), + variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount); return type; @@ -140,8 +145,8 @@ public String toSql() { StringBuilder sb = new StringBuilder(); sb.append("variant"); sb.append("<"); - if (!predefinedFields.isEmpty()) { - sb.append(predefinedFields.stream().map(VariantField::toSql).collect(Collectors.joining(","))); + if (!variantPathPatterns.isEmpty()) { + sb.append(variantPathPatterns.stream().map(VariantField::toSql).collect(Collectors.joining(","))); sb.append(","); } @@ -187,7 +192,7 @@ public boolean equals(Object o) { && this.enableTypedPathsToSparse == other.enableTypedPathsToSparse && this.enableVariantDocMode == other.enableVariantDocMode && this.variantDocMaterializationMinRows == other.variantDocMaterializationMinRows - && Objects.equals(predefinedFields, other.predefinedFields); + && Objects.equals(variantPathPatterns, other.variantPathPatterns); } @Override @@ -199,12 +204,14 @@ public boolean equalsForRecursiveCte(Object o) { return false; } VariantType other = (VariantType) o; - if (predefinedFields.size() != other.predefinedFields.size()) { + List typedPathPatterns = getVariantTypedPathPatterns(); + List otherTypedPathPatterns = other.getVariantTypedPathPatterns(); + if (typedPathPatterns.size() != otherTypedPathPatterns.size()) 
{ return false; } - for (int i = 0; i < predefinedFields.size(); ++i) { - if (!predefinedFields.get(i).getDataType() - .equalsForRecursiveCte(other.predefinedFields.get(i).getDataType())) { + for (int i = 0; i < typedPathPatterns.size(); ++i) { + if (!typedPathPatterns.get(i).getDataType() + .equalsForRecursiveCte(otherTypedPathPatterns.get(i).getDataType())) { return false; } } @@ -216,7 +223,7 @@ public int hashCode() { return Objects.hash(super.hashCode(), variantMaxSubcolumnsCount, enableTypedPathsToSparse, variantMaxSparseColumnStatisticsSize, variantSparseHashShardCount, enableVariantDocMode, variantDocMaterializationMinRows, variantDocShardCount, - predefinedFields); + variantPathPatterns); } @Override @@ -229,8 +236,14 @@ public String toString() { return toSql(); } - public List getPredefinedFields() { - return predefinedFields; + public List getVariantPathPatterns() { + return variantPathPatterns; + } + + public List getVariantTypedPathPatterns() { + return variantPathPatterns.stream() + .filter(VariantField::isTypedPathPatternType) + .collect(ImmutableList.toImmutableList()); } /** @@ -241,7 +254,7 @@ public List getPredefinedFields() { * @return Optional containing the matching VariantField, or empty if no match */ public Optional findMatchingField(String fieldName) { - for (VariantField field : predefinedFields) { + for (VariantField field : getVariantTypedPathPatterns()) { if (field.matches(fieldName)) { return Optional.of(field); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index 5470c83222fa66..472299f64a9155 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -394,14 +394,14 @@ private Pair getColumnType(List typeNodes, int start) } else if 
(tPrimitiveType == TPrimitiveType.VARIANT) { // Preserve VARIANT-specific properties from PTypeNode, especially variant_max_subcolumns_count. int maxSubcolumns = typeNode.getVariantMaxSubcolumnsCount(); - // Currently no predefined fields are carried in PTypeNode for VARIANT, so use empty list and default + // Currently no variant path patterns are carried in PTypeNode for VARIANT, so use empty list and default // values for other properties. type = new VariantType(new ArrayList<>(), maxSubcolumns, /*enableTypedPathsToSparse*/ false, /*variantMaxSparseColumnStatisticsSize*/ 10000, /*variantSparseHashShardCount*/ 0, /*variantEnableDocMode*/ false, - /*variantDocMaterializationMinRows*/ 0, + /*variantDocMaterializationMinRows*/ 0L, /*variantDocShardCount*/ 0); parsedNodes = 1; } else { @@ -555,4 +555,3 @@ public void checkAuth(ConnectContext ctx) { } } } - diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java index cde493d4adb409..6fccdec7a71884 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/ColumnTest.java @@ -28,7 +28,12 @@ import org.apache.doris.common.io.Text; import org.apache.doris.common.jmockit.Deencapsulation; import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.proto.OlapFile; +import org.apache.doris.thrift.TColumn; +import org.apache.doris.thrift.TPatternType; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -37,6 +42,7 @@ import java.io.DataOutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; public class ColumnTest { @@ -173,4 +179,61 @@ public void testBaseColumn() { mvColumnComplex.setDefineExpr(add); Assert.assertTrue(mvColumnComplex.tryGetBaseColumnName().equalsIgnoreCase("mv_b")); } + + @Test + 
public void testVariantSkipPatternChildrenSerialization() throws Exception { + ArrayList variantPathPatterns = new ArrayList<>(); + // Deliberately interleave skip and typed paths to verify grouped output order. + variantPathPatterns.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + variantPathPatterns.add(new VariantField("num_*", Type.BIGINT, "", TPatternType.MATCH_NAME_GLOB)); + variantPathPatterns.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + variantPathPatterns.add(new VariantField("id", Type.INT, "", TPatternType.MATCH_NAME)); + VariantType variantType = new VariantType(variantPathPatterns); + + Column variantColumn = new Column("v", variantType, true); + Assert.assertEquals(4, variantColumn.getChildren().size()); + Assert.assertEquals(2, variantColumn.getVariantTypedPathChildrenOrEmpty().size()); + Assert.assertEquals(2, variantColumn.getVariantSkipPatternChildrenOrEmpty().size()); + + TColumn thriftColumn = variantColumn.toThrift(); + Assert.assertNotNull(thriftColumn.getChildrenColumn()); + Assert.assertEquals(4, thriftColumn.getChildrenColumnSize()); + Assert.assertEquals("num_*", thriftColumn.getChildrenColumn().get(0).getColumnName()); + Assert.assertEquals(TPatternType.MATCH_NAME_GLOB, thriftColumn.getChildrenColumn().get(0).getPatternType()); + Assert.assertEquals("id", thriftColumn.getChildrenColumn().get(1).getColumnName()); + Assert.assertEquals(TPatternType.MATCH_NAME, thriftColumn.getChildrenColumn().get(1).getPatternType()); + Assert.assertEquals("debug_*", thriftColumn.getChildrenColumn().get(2).getColumnName()); + Assert.assertEquals(TPatternType.SKIP_NAME_GLOB, thriftColumn.getChildrenColumn().get(2).getPatternType()); + Assert.assertEquals("secret", thriftColumn.getChildrenColumn().get(3).getColumnName()); + Assert.assertEquals(TPatternType.SKIP_NAME, thriftColumn.getChildrenColumn().get(3).getPatternType()); + + OlapFile.ColumnPB pbColumn = variantColumn.toPb(Sets.newHashSet(), 
Lists.newArrayList()); + Assert.assertEquals(4, pbColumn.getChildrenColumnsCount()); + Assert.assertEquals("num_*", pbColumn.getChildrenColumns(0).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.MATCH_NAME_GLOB, + pbColumn.getChildrenColumns(0).getPatternType()); + Assert.assertEquals("id", pbColumn.getChildrenColumns(1).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.MATCH_NAME, pbColumn.getChildrenColumns(1).getPatternType()); + Assert.assertEquals("debug_*", pbColumn.getChildrenColumns(2).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.SKIP_NAME_GLOB, + pbColumn.getChildrenColumns(2).getPatternType()); + Assert.assertEquals("secret", pbColumn.getChildrenColumns(3).getName()); + Assert.assertEquals(OlapFile.PatternTypePB.SKIP_NAME, pbColumn.getChildrenColumns(3).getPatternType()); + } + + @Test + public void testVariantSchemaChangeRejectsSkipPatternMutation() { + ArrayList oldPatterns = new ArrayList<>(); + oldPatterns.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + Column oldColumn = new Column("v", new VariantType(oldPatterns), true); + + Column newColumn = new Column("v", new VariantType(new ArrayList<>()), true); + + try { + oldColumn.checkSchemaChangeAllowed(newColumn); + Assert.fail("No exception throws."); + } catch (DdlException e) { + Assert.assertTrue(e.getMessage().contains("Can not change variant skip patterns")); + } + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java index fe3e2b0bd0a2fa..7033bf9a910f45 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/TypeTest.java @@ -17,6 +17,8 @@ package org.apache.doris.catalog; +import org.apache.doris.thrift.TPatternType; + import org.junit.Assert; import org.junit.Test; @@ -128,6 +130,43 @@ public void testVariantPredefinedFieldsExactMatch() { 
Assert.assertFalse(Type.matchExactType(v1, v4, false)); } + @Test + public void testVariantSkipPatternsIgnoredInExactMatch() { + ArrayList fields1 = new ArrayList<>(); + fields1.add(new VariantField("typed_a", Type.INT, "", TPatternType.MATCH_NAME)); + fields1.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + fields1.add(new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME)); + VariantType v1 = new VariantType(fields1); + + ArrayList fields2 = new ArrayList<>(); + fields2.add(new VariantField("typed_b", Type.INT, "", TPatternType.MATCH_NAME)); + fields2.add(new VariantField("tmp_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + fields2.add(new VariantField("pwd", Type.STRING, "", TPatternType.SKIP_NAME)); + VariantType v2 = new VariantType(fields2); + + // Exact type check should only compare typed-path fields and ignore skip patterns. + Assert.assertTrue(Type.matchExactType(v1, v2, false)); + + ArrayList fields3 = new ArrayList<>(); + fields3.add(new VariantField("typed_a", Type.BIGINT, "", TPatternType.MATCH_NAME)); + fields3.add(new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB)); + VariantType v3 = new VariantType(fields3); + Assert.assertFalse(Type.matchExactType(v1, v3, false)); + } + + @Test + public void testVariantFieldSkipSqlAndMatchesField() { + VariantField skipExact = new VariantField("secret", Type.STRING, "", TPatternType.SKIP_NAME); + VariantField skipGlob = new VariantField("debug_*", Type.STRING, "", TPatternType.SKIP_NAME_GLOB); + VariantField typed = new VariantField("secret", Type.STRING, "", TPatternType.MATCH_NAME); + + Assert.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql(0)); + Assert.assertEquals("SKIP 'debug_*'", skipGlob.toSql(0)); + Assert.assertFalse(skipExact.matchesField(typed)); + Assert.assertFalse(typed.matchesField(skipExact)); + Assert.assertFalse(skipExact.equals(typed)); + } + // ===================== Mixed Nesting & Precision 
===================== @Test public void testArrayMapStructCombinationWithPrecision() { diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java index 1263cc7e95d877..6f97db27cff8fe 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java @@ -47,6 +47,8 @@ import org.apache.doris.nereids.trees.plans.commands.ExplainCommand; import org.apache.doris.nereids.trees.plans.commands.ExplainCommand.ExplainLevel; import org.apache.doris.nereids.trees.plans.commands.ReplayCommand; +import org.apache.doris.nereids.trees.plans.commands.info.ColumnDefinition; +import org.apache.doris.nereids.trees.plans.commands.info.CreateTableInfo; import org.apache.doris.nereids.trees.plans.commands.merge.MergeIntoCommand; import org.apache.doris.nereids.trees.plans.logical.LogicalAggregate; import org.apache.doris.nereids.trees.plans.logical.LogicalCTE; @@ -61,10 +63,14 @@ import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DecimalV2Type; import org.apache.doris.nereids.types.DecimalV3Type; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VariantField; +import org.apache.doris.nereids.types.VariantType; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.GlobalVariable; import org.apache.doris.qe.SqlModeHelper; import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.thrift.TPatternType; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -1515,6 +1521,75 @@ public void testMergeInto() throws Exception { Assertions.assertThrows(ParseException.class, () -> parser.parseSingle(invalidSql4)); } + @Test + public void testParseVariantSkipPatternsInCreateTable() { + NereidsParser parser = new NereidsParser(); + String sql = 
"create table t_skip_parse (\n" + + " id int,\n" + + " v variant\n" + + ")\n" + + "duplicate key(id)\n" + + "distributed by hash(id) buckets 1\n" + + "properties('replication_num'='1')"; + LogicalPlan logicalPlan = parser.parseSingle(sql); + Assertions.assertInstanceOf(CreateTableCommand.class, logicalPlan); + + CreateTableInfo createTableInfo = ((CreateTableCommand) logicalPlan).getCreateTableInfo(); + ColumnDefinition variantColumn = createTableInfo.getColumnDefinitions().stream() + .filter(c -> "v".equalsIgnoreCase(c.getName())) + .findFirst() + .orElseThrow(() -> new AssertionError("variant column not found")); + + Assertions.assertTrue(variantColumn.getType() instanceof VariantType); + VariantType variantType = (VariantType) variantColumn.getType(); + List variantPathPatterns = variantType.getVariantPathPatterns(); + Assertions.assertEquals(3, variantPathPatterns.size()); + + VariantField skipGlob = variantPathPatterns.get(0); + Assertions.assertTrue(skipGlob.isSkipPatternType()); + Assertions.assertEquals(StringType.INSTANCE, skipGlob.getDataType()); + Assertions.assertEquals(TPatternType.SKIP_NAME_GLOB, skipGlob.toCatalogDataType().getPatternType()); + Assertions.assertEquals("SKIP 'debug_*'", skipGlob.toSql()); + + VariantField skipExact = variantPathPatterns.get(1); + Assertions.assertTrue(skipExact.isSkipPatternType()); + Assertions.assertEquals(StringType.INSTANCE, skipExact.getDataType()); + Assertions.assertEquals(TPatternType.SKIP_NAME, skipExact.toCatalogDataType().getPatternType()); + Assertions.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql()); + + VariantField typedPattern = variantPathPatterns.get(2); + Assertions.assertTrue(typedPattern.isTypedPathPatternType()); + Assertions.assertEquals(TPatternType.MATCH_NAME_GLOB, typedPattern.toCatalogDataType().getPatternType()); + Assertions.assertTrue(typedPattern.matches("num_a")); + Assertions.assertEquals(1, variantType.getVariantTypedPathPatterns().size()); + } + + @Test + public void 
testParseVariantSkipOnlyWithDocMode() { + NereidsParser parser = new NereidsParser(); + String sql = "create table t_skip_doc_mode (\n" + + " id int,\n" + + " v variant\n" + + ")\n" + + "duplicate key(id)\n" + + "distributed by hash(id) buckets 1\n" + + "properties('replication_num'='1')"; + LogicalPlan logicalPlan = parser.parseSingle(sql); + Assertions.assertInstanceOf(CreateTableCommand.class, logicalPlan); + + CreateTableInfo createTableInfo = ((CreateTableCommand) logicalPlan).getCreateTableInfo(); + ColumnDefinition variantColumn = createTableInfo.getColumnDefinitions().stream() + .filter(c -> "v".equalsIgnoreCase(c.getName())) + .findFirst() + .orElseThrow(() -> new AssertionError("variant column not found")); + + VariantType variantType = (VariantType) variantColumn.getType(); + Assertions.assertTrue(variantType.getEnableVariantDocMode()); + Assertions.assertEquals(1, variantType.getVariantPathPatterns().size()); + Assertions.assertEquals(0, variantType.getVariantTypedPathPatterns().size()); + Assertions.assertTrue(variantType.getVariantPathPatterns().get(0).isSkipPatternType()); + } + @Test public void testUnnest() { String sql = "SELECT t.* FROM LATERAL unnest([1,2], ['hi','hello']) WITH ORDINALITY AS t(c1,c2);"; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java index 66289238e86414..6be411de9fd357 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/types/VariantFieldMatchTest.java @@ -362,4 +362,70 @@ public void testGlobCharacterClass() { Assertions.assertFalse(field2.matches("int_1")); } + + @Test + public void testSkipPatternFlagsAndSql() { + VariantField skipExact = new VariantField("secret", StringType.INSTANCE, "", + TPatternType.SKIP_NAME.name()); + Assertions.assertTrue(skipExact.isSkipPatternType()); + 
Assertions.assertFalse(skipExact.isTypedPathPatternType()); + Assertions.assertEquals("SKIP MATCH_NAME 'secret'", skipExact.toSql()); + Assertions.assertEquals(TPatternType.SKIP_NAME, skipExact.toCatalogDataType().getPatternType()); + + VariantField skipGlob = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + Assertions.assertTrue(skipGlob.isSkipPatternType()); + Assertions.assertFalse(skipGlob.isTypedPathPatternType()); + Assertions.assertEquals("SKIP 'debug_*'", skipGlob.toSql()); + Assertions.assertEquals(TPatternType.SKIP_NAME_GLOB, skipGlob.toCatalogDataType().getPatternType()); + } + + @Test + public void testSkipPatternNeverMatches() { + VariantField skipExact = new VariantField("secret", StringType.INSTANCE, "", + TPatternType.SKIP_NAME.name()); + VariantField skipGlob = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + Assertions.assertFalse(skipExact.matches("secret")); + Assertions.assertFalse(skipGlob.matches("debug_x")); + } + + @Test + public void testFindMatchingFieldIgnoresSkipPatterns() { + VariantField skip = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + VariantField typed = new VariantField("num_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantType variantType = new VariantType(ImmutableList.of(skip, typed)); + + Assertions.assertFalse(variantType.findMatchingField("debug_x").isPresent()); + Optional result = variantType.findMatchingField("num_a"); + Assertions.assertTrue(result.isPresent()); + Assertions.assertEquals(BigIntType.INSTANCE, result.get().getDataType()); + } + + @Test + public void testEqualsAndHashCodeIncludePatternType() { + VariantField typed = new VariantField("a", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantField skip = new VariantField("a", BigIntType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + + 
Assertions.assertNotEquals(typed, skip); + Assertions.assertNotEquals(typed.hashCode(), skip.hashCode()); + } + + @Test + public void testGetVariantTypedPathPatternsFiltersSkipPatterns() { + VariantField skip = new VariantField("debug_*", StringType.INSTANCE, "", + TPatternType.SKIP_NAME_GLOB.name()); + VariantField typed1 = new VariantField("num_*", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME_GLOB.name()); + VariantField typed2 = new VariantField("id", BigIntType.INSTANCE, "", + TPatternType.MATCH_NAME.name()); + VariantType variantType = new VariantType(ImmutableList.of(skip, typed1, typed2)); + + Assertions.assertEquals(3, variantType.getVariantPathPatterns().size()); + Assertions.assertEquals(2, variantType.getVariantTypedPathPatterns().size()); + } } diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index ff54f54aed7242..0ae4163dbe55fd 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -346,6 +346,8 @@ message AlterTabletPB { enum PatternTypePB { MATCH_NAME = 1; MATCH_NAME_GLOB = 2; + SKIP_NAME = 3; + SKIP_NAME_GLOB = 4; } message ColumnPB { diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift index 11125c006493a4..df4a50956fbff3 100644 --- a/gensrc/thrift/Descriptors.thrift +++ b/gensrc/thrift/Descriptors.thrift @@ -24,7 +24,9 @@ include "Partitions.thrift" enum TPatternType { MATCH_NAME = 1, - MATCH_NAME_GLOB = 2 + MATCH_NAME_GLOB = 2, + SKIP_NAME = 3, + SKIP_NAME_GLOB = 4 } enum TAccessPathType { @@ -97,8 +99,8 @@ struct TColumn { 24: optional i32 variant_max_sparse_column_statistics_size = 10000 25: optional i32 variant_sparse_hash_shard_count 26: optional bool variant_enable_doc_mode - 27: optional i64 variant_doc_materialization_min_rows - 28: optional i32 variant_doc_hash_shard_count + 27: optional i64 variant_doc_materialization_min_rows + 28: optional i32 variant_doc_hash_shard_count } struct TSlotDescriptor { diff --git 
a/regression-test/data/variant_p0/predefine/test_schema_template_skip.out b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out new file mode 100644 index 00000000000000..37f9345d5d42c7 --- /dev/null +++ b/regression-test/data/variant_p0/predefine/test_schema_template_skip.out @@ -0,0 +1,101 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !skip_basic_glob_1 -- +1 visible +2 \N + +-- !skip_basic_glob_2 -- +1 \N +2 \N + +-- !skip_basic_glob_3 -- +1 \N +2 \N + +-- !skip_basic_glob_4 -- +1 \N +2 yes + +-- !skip_match_name_1 -- +1 \N + +-- !skip_match_name_2 -- +1 visible + +-- !skip_match_name_3 -- +1 open + +-- !skip_nested_1 -- +1 \N + +-- !skip_nested_2 -- +1 2 + +-- !skip_nested_3 -- +1 10 + +-- !skip_priority_1 -- +1 \N + +-- !skip_priority_2 -- +1 val + +-- !skip_invalid_glob_1 -- +1 x + +-- !skip_invalid_glob_2 -- +1 y + +-- !skip_glob_cross_1 -- +1 \N + +-- !skip_glob_cross_2 -- +1 2 + +-- !skip_glob_cross_3 -- +1 3 + +-- !skip_multi_1 -- +1 \N + +-- !skip_multi_2 -- +1 \N + +-- !skip_multi_3 -- +1 \N + +-- !skip_multi_4 -- +1 visible + +-- !skip_whole_col -- +1 {"normal_field":"visible"} +2 {"keep_me":"yes"} + +-- !skip_coexist_1 -- +1 \N + +-- !skip_coexist_2 -- +1 100 + +-- !skip_coexist_3 -- +1 val + +-- !skip_bulk_1 -- +0 + +-- !skip_bulk_2 -- +0 + +-- !skip_bulk_3 -- +0 + +-- !skip_bulk_4 -- +100 + +-- !skip_bulk_5 -- +100 + +-- !skip_bulk_6 -- +1 user_1 10 +50 user_50 500 +100 user_100 1000 + diff --git a/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy new file mode 100644 index 00000000000000..95a03c040c7363 --- /dev/null +++ b/regression-test/suites/variant_p0/predefine/test_schema_template_skip.groovy @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_schema_template_skip", "p0") { + sql """ set describe_extend_variant_column = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + + // Test 1: Basic SKIP glob + def tableName1 = "test_skip_basic_glob" + sql "DROP TABLE IF EXISTS ${tableName1}" + sql """CREATE TABLE ${tableName1} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName1} values(1, '{"debug_info":"secret","debug_trace":"trace_val","normal_field":"visible"}')""" + sql """insert into ${tableName1} values(2, '{"debug_level":5,"keep_me":"yes"}')""" + + qt_skip_basic_glob_1 """ SELECT id, data['normal_field'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_2 """ SELECT id, data['debug_info'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_3 """ SELECT id, data['debug_trace'] FROM ${tableName1} ORDER BY id """ + qt_skip_basic_glob_4 """ SELECT id, data['keep_me'] FROM ${tableName1} ORDER BY id """ + + // Test 2: SKIP MATCH_NAME exact match + def tableName2 = "test_skip_match_name" + sql "DROP TABLE IF EXISTS ${tableName2}" + sql 
"""CREATE TABLE ${tableName2} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName2} values(1, '{"secret":"hidden","secret_key":"visible","public":"open"}')""" + + qt_skip_match_name_1 """ SELECT id, data['secret'] FROM ${tableName2} ORDER BY id """ + qt_skip_match_name_2 """ SELECT id, data['secret_key'] FROM ${tableName2} ORDER BY id """ + qt_skip_match_name_3 """ SELECT id, data['public'] FROM ${tableName2} ORDER BY id """ + + // Test 3: Nested path SKIP + def tableName3 = "test_skip_nested_path" + sql "DROP TABLE IF EXISTS ${tableName3}" + sql """CREATE TABLE ${tableName3} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName3} values(1, '{"a":{"b":{"temp_1":1,"temp_12":10,"keep":2}}}')""" + + qt_skip_nested_1 """ SELECT id, data['a']['b']['temp_1'] FROM ${tableName3} ORDER BY id """ + qt_skip_nested_2 """ SELECT id, data['a']['b']['keep'] FROM ${tableName3} ORDER BY id """ + // temp_12 has 2 chars after temp_, so '?' 
should NOT match it + qt_skip_nested_3 """ SELECT id, data['a']['b']['temp_12'] FROM ${tableName3} ORDER BY id """ + + // Test 4: SKIP takes priority over typed pattern + def tableName4 = "test_skip_priority" + sql "DROP TABLE IF EXISTS ${tableName4}" + sql """CREATE TABLE ${tableName4} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName4} values(1, '{"num_a":100,"other":"val"}')""" + + qt_skip_priority_1 """ SELECT id, data['num_a'] FROM ${tableName4} ORDER BY id """ + qt_skip_priority_2 """ SELECT id, data['other'] FROM ${tableName4} ORDER BY id """ + + // Test 5: Invalid skip glob is allowed in DDL (same behavior as typed path) + def tableName5 = "test_skip_invalid_glob" + sql "DROP TABLE IF EXISTS ${tableName5}" + sql """CREATE TABLE ${tableName5} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName5} values(1, '{"i":"x","invalid":"y"}')""" + qt_skip_invalid_glob_1 """ SELECT id, data['i'] FROM ${tableName5} ORDER BY id """ + qt_skip_invalid_glob_2 """ SELECT id, data['invalid'] FROM ${tableName5} ORDER BY id """ + + // Test 6: Glob cross-level matching — pattern spans nested path + def tableName6 = "test_skip_glob_cross_level" + sql "DROP TABLE IF EXISTS ${tableName6}" + sql """CREATE TABLE ${tableName6} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName6} values(1, '{"a":{"debug_x":1,"keep":2},"debug_y":3}')""" + + qt_skip_glob_cross_1 """ SELECT id, data['a']['debug_x'] FROM ${tableName6} ORDER BY id """ + qt_skip_glob_cross_2 """ SELECT 
id, data['a']['keep'] FROM ${tableName6} ORDER BY id """ + qt_skip_glob_cross_3 """ SELECT id, data['debug_y'] FROM ${tableName6} ORDER BY id """ + + // Test 7: Multiple SKIP patterns + def tableName7 = "test_skip_multiple" + sql "DROP TABLE IF EXISTS ${tableName7}" + sql """CREATE TABLE ${tableName7} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName7} values(1, '{"temp_data":"t","internal_id":1,"password":"secret","name":"visible"}')""" + + qt_skip_multi_1 """ SELECT id, data['temp_data'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_2 """ SELECT id, data['internal_id'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_3 """ SELECT id, data['password'] FROM ${tableName7} ORDER BY id """ + qt_skip_multi_4 """ SELECT id, data['name'] FROM ${tableName7} ORDER BY id """ + + // Test 8: SELECT whole column — skipped fields should not appear in JSON output + qt_skip_whole_col """ SELECT id, data FROM ${tableName1} ORDER BY id """ + + // Test 9: SKIP with non-conflicting typed pattern coexistence + def tableName9 = "test_skip_coexist" + sql "DROP TABLE IF EXISTS ${tableName9}" + sql """CREATE TABLE ${tableName9} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + sql """insert into ${tableName9} values(1, '{"debug_x":1,"num_a":100,"other":"val"}')""" + + qt_skip_coexist_1 """ SELECT id, data['debug_x'] FROM ${tableName9} ORDER BY id """ + qt_skip_coexist_2 """ SELECT id, data['num_a'] FROM ${tableName9} ORDER BY id """ + qt_skip_coexist_3 """ SELECT id, data['other'] FROM ${tableName9} ORDER BY id """ + + // Test 10: Bulk data — verify SKIP works correctly with larger dataset + def tableName11 = "test_skip_bulk" + sql "DROP TABLE IF EXISTS 
${tableName11}" + sql """CREATE TABLE ${tableName11} ( + `id` bigint NULL, + `data` variant NOT NULL + ) ENGINE=OLAP DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( "replication_allocation" = "tag.location.default: 1")""" + + // Insert 100 rows with mixed fields + for (int i = 1; i <= 100; i++) { + sql """insert into ${tableName11} values(${i}, + '{"debug_id":${i},"debug_msg":"msg_${i}","internal":"secret_${i}","name":"user_${i}","value":${i * 10}}')""" + } + + // Skipped fields should all be NULL + qt_skip_bulk_1 """ SELECT count(*) FROM ${tableName11} WHERE data['debug_id'] IS NOT NULL """ + qt_skip_bulk_2 """ SELECT count(*) FROM ${tableName11} WHERE data['debug_msg'] IS NOT NULL """ + qt_skip_bulk_3 """ SELECT count(*) FROM ${tableName11} WHERE data['internal'] IS NOT NULL """ + // Non-skipped fields should all be present + qt_skip_bulk_4 """ SELECT count(*) FROM ${tableName11} WHERE data['name'] IS NOT NULL """ + qt_skip_bulk_5 """ SELECT count(*) FROM ${tableName11} WHERE data['value'] IS NOT NULL """ + // Spot check specific rows + qt_skip_bulk_6 """ SELECT id, data['name'], data['value'] FROM ${tableName11} WHERE id IN (1, 50, 100) ORDER BY id """ +}