Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions be/src/olap/tablet_meta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ bvar::Window<bvar::Adder<uint64_t>> g_contains_agg_with_cache_if_eligible_full_h
"g_contains_agg_with_cache_if_eligible_full_hit_1m",
&g_contains_agg_with_cache_if_eligible_full_hit, 60);

namespace {

// Converts the Thrift-side pattern type of a column into the protobuf enum that
// is stored in tablet metadata. The numeric value is carried over unchanged; no
// per-enumerator mapping and no range validation happen at runtime.
inline PatternTypePB to_pattern_type_pb(TPatternType::type pattern_type) {
    // The plain cast is only correct while both IDL enums number their
    // enumerators identically. Fail the build if the two definitions ever drift
    // apart instead of silently persisting the wrong pattern type.
    static_assert(static_cast<int>(TPatternType::MATCH_NAME) ==
                          static_cast<int>(PatternTypePB::MATCH_NAME),
                  "TPatternType and PatternTypePB must agree on MATCH_NAME");
    static_assert(static_cast<int>(TPatternType::MATCH_NAME_GLOB) ==
                          static_cast<int>(PatternTypePB::MATCH_NAME_GLOB),
                  "TPatternType and PatternTypePB must agree on MATCH_NAME_GLOB");
    return static_cast<PatternTypePB>(pattern_type);
}

} // namespace

TabletMetaSharedPtr TabletMeta::create(
const TCreateTabletReq& request, const TabletUid& tablet_uid, uint64_t shard_id,
uint32_t next_unique_id,
Expand Down Expand Up @@ -533,13 +541,7 @@ void TabletMeta::init_column_from_tcolumn(uint32_t unique_id, const TColumn& tco
column->set_variant_max_subcolumns_count(tcolumn.column_type.variant_max_subcolumns_count);
}
if (tcolumn.__isset.pattern_type) {
switch (tcolumn.pattern_type) {
case TPatternType::MATCH_NAME:
column->set_pattern_type(PatternTypePB::MATCH_NAME);
break;
case TPatternType::MATCH_NAME_GLOB:
column->set_pattern_type(PatternTypePB::MATCH_NAME_GLOB);
}
column->set_pattern_type(to_pattern_type_pb(tcolumn.pattern_type));
}
if (tcolumn.__isset.variant_enable_typed_paths_to_sparse) {
column->set_variant_enable_typed_paths_to_sparse(
Expand Down
4 changes: 2 additions & 2 deletions be/src/olap/tablet_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ void TabletColumn::init_from_pb(const ColumnPB& column) {
_variant.doc_hash_shard_count = column.variant_doc_hash_shard_count();
}
if (column.has_pattern_type()) {
_pattern_type = column.pattern_type();
_field_pattern_type = column.pattern_type();
}
}

Expand Down Expand Up @@ -755,7 +755,7 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const {
column->set_index_length(0);
}
column->set_variant_max_subcolumns_count(_variant.max_subcolumns_count);
column->set_pattern_type(_pattern_type);
column->set_pattern_type(_field_pattern_type);
column->set_variant_enable_typed_paths_to_sparse(_variant.enable_typed_paths_to_sparse);
column->set_variant_max_sparse_column_statistics_size(
_variant.max_sparse_column_statistics_size);
Expand Down
5 changes: 3 additions & 2 deletions be/src/olap/tablet_schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ class TabletColumn : public MetadataAdder<TabletColumn> {
_variant.max_subcolumns_count = variant_max_subcolumns_count;
}

PatternTypePB pattern_type() const { return _pattern_type; }
PatternTypePB pattern_type() const { return _field_pattern_type; }

bool variant_enable_typed_paths_to_sparse() const {
return _variant.enable_typed_paths_to_sparse;
Expand Down Expand Up @@ -320,7 +320,8 @@ class TabletColumn : public MetadataAdder<TabletColumn> {
// The extracted sub-columns from "variant" contain the following information:
int32_t _parent_col_unique_id = -1; // "variant" -> col_unique_id
vectorized::PathInDataPtr _column_path; // the path of the sub-columns themselves
PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;
// When pattern_type is absent (legacy metadata), keep typed-path default behavior.
PatternTypePB _field_pattern_type = PatternTypePB::MATCH_NAME_GLOB;

VariantParams _variant;
};
Expand Down
174 changes: 163 additions & 11 deletions be/src/vec/common/variant_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <list>
#include <memory>
#include <mutex>
Expand Down Expand Up @@ -64,6 +65,7 @@
#include "olap/tablet_fwd.h"
#include "olap/tablet_schema.h"
#include "re2/re2.h"
#include "re2/set.h"
#include "runtime/client_cache.h"
#include "runtime/define_primitive_type.h"
#include "runtime/exec_env.h"
Expand Down Expand Up @@ -130,6 +132,27 @@ inline void append_escaped_regex_char(std::string* regex_output, char ch) {

// Small LRU to cap compiled glob patterns
constexpr size_t kGlobRegexCacheCapacity = 256;
constexpr size_t kSkipRe2SetThreshold = 32;

// Hash functor for string-keyed hash containers that opts into heterogeneous
// ("transparent") lookup via `is_transparent`, so a key can be probed with a
// std::string_view without first materialising a std::string.
//
// Every overload funnels into std::hash<std::string_view>, so the same character
// sequence hashes to the same value no matter which string-like type carries it.
struct TransparentStringHash {
    using is_transparent = void;
    size_t operator()(std::string_view s) const { return std::hash<std::string_view> {}(s); }
    size_t operator()(const std::string& s) const { return (*this)(std::string_view(s)); }
    // Without this overload a `const char*` / string-literal argument is ambiguous
    // between the two overloads above (each needs one user-defined conversion).
    // `s` must be a non-null, NUL-terminated string.
    size_t operator()(const char* s) const { return (*this)(std::string_view(s)); }
};

// Equality functor paired with a transparent hash: both operands are taken as
// std::string_view, so any mix of string-like arguments is compared by content
// without building a temporary std::string.
struct TransparentStringEq {
    using is_transparent = void;
    bool operator()(std::string_view lhs, std::string_view rhs) const {
        return lhs.compare(rhs) == 0;
    }
};

// Pre-built lookup state for a set of skip-path patterns. It is filled by
// build_compiled_skip_matcher() and read by should_skip_path().
struct CompiledSkipMatcher {
// Exact-name skip patterns; the transparent hash/eq allow probing with a
// std::string_view.
phmap::flat_hash_set<std::string, TransparentStringHash, TransparentStringEq> exact_patterns;
// One compiled regex per glob skip pattern; filled only when the RE2::Set
// below is not used.
std::vector<std::unique_ptr<RE2>> glob_regexes;
// All glob skip patterns compiled into a single set; filled only when
// use_re2_set is true.
std::unique_ptr<RE2::Set> glob_regex_set;
// Selects which of the two glob representations above should_skip_path() consults.
bool use_re2_set = false;
};

struct GlobRegexCacheEntry {
std::shared_ptr<RE2> re2;
Expand Down Expand Up @@ -259,6 +282,120 @@ bool glob_match_re2(const std::string& glob_pattern, const std::string& candidat
return RE2::FullMatch(candidate_path, *compiled);
}

// Compiles `skip_path_patterns` into a CompiledSkipMatcher and stores it in `*out`.
//
// - SKIP_NAME patterns are copied into the exact-match hash set.
// - SKIP_NAME_GLOB patterns are translated with glob_to_regex(); a glob that
//   fails translation is dropped without reporting an error.
// - Patterns of any other type are ignored.
//
// When `enable_re2_set` is true and at least kSkipRe2SetThreshold globs remain,
// they are compiled into one RE2::Set anchored at both ends; otherwise each glob
// gets its own RE2 object.
//
// Returns InvalidArgument when `out` is null or when a generated regex cannot be
// added or compiled. `*out` is assigned only on success.
Status build_compiled_skip_matcher(
        const std::vector<std::pair<std::string, PatternTypePB>>& skip_path_patterns,
        bool enable_re2_set, std::shared_ptr<const CompiledSkipMatcher>* out) {
    if (out == nullptr) {
        return Status::InvalidArgument("Output pointer for compiled skip matcher is null");
    }

    auto matcher = std::make_shared<CompiledSkipMatcher>();
    matcher->exact_patterns.reserve(skip_path_patterns.size());

    std::vector<std::string> glob_regex_patterns;
    glob_regex_patterns.reserve(skip_path_patterns.size());
    for (const auto& [pattern, pt] : skip_path_patterns) {
        if (is_skip_exact_path_pattern_type(pt)) {
            matcher->exact_patterns.insert(pattern);
            continue;
        }
        if (!is_skip_glob_path_pattern_type(pt)) {
            continue;
        }

        std::string regex_pattern;
        auto st = glob_to_regex(pattern, &regex_pattern);
        if (!st.ok()) {
            // A glob that cannot be translated is dropped rather than failing the build.
            continue;
        }
        glob_regex_patterns.emplace_back(std::move(regex_pattern));
    }

    if (glob_regex_patterns.empty()) {
        *out = std::move(matcher);
        return Status::OK();
    }

    if (enable_re2_set && glob_regex_patterns.size() >= kSkipRe2SetThreshold) {
        RE2::Options options;
        auto set = std::make_unique<RE2::Set>(options, RE2::ANCHOR_BOTH);
        // Filled by RE2::Set::Add on failure so the returned status says why the
        // pattern was rejected, mirroring the per-regex branch below.
        std::string add_error;
        for (const auto& regex_pattern : glob_regex_patterns) {
            if (set->Add(regex_pattern, &add_error) < 0) {
                return Status::InvalidArgument(
                        "Failed to add regexp '{}' into skip pattern matcher set: {}",
                        regex_pattern, add_error);
            }
        }
        if (!set->Compile()) {
            return Status::InvalidArgument("Failed to compile skip pattern matcher set");
        }
        matcher->glob_regex_set = std::move(set);
        matcher->use_re2_set = true;
    } else {
        matcher->glob_regexes.reserve(glob_regex_patterns.size());
        for (const auto& regex_pattern : glob_regex_patterns) {
            auto compiled = std::make_unique<RE2>(regex_pattern);
            if (!compiled->ok()) {
                return Status::InvalidArgument(
                        "Invalid regexp '{}' generated from skip glob pattern: {}", regex_pattern,
                        compiled->error());
            }
            matcher->glob_regexes.emplace_back(std::move(compiled));
        }
    }

    *out = std::move(matcher);
    return Status::OK();
}

// Returns true when `path` matches any pattern held by `matcher`: first the
// exact-name set, then either the combined RE2::Set or the individual compiled
// regexes, depending on `matcher.use_re2_set`.
bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path) {
    // Exact names cost one hash probe, so try them before any regex work.
    if (matcher.exact_patterns.find(path) != matcher.exact_patterns.end()) {
        return true;
    }

    if (matcher.use_re2_set) {
        // Only the yes/no answer is used, so pass a null result vector. This
        // avoids allocating and filling a std::vector<int> of matching indexes
        // on every call of this per-path check.
        return matcher.glob_regex_set->Match(path, nullptr);
    }

    for (const auto& regex : matcher.glob_regexes) {
        if (RE2::FullMatch(path, *regex)) {
            return true;
        }
    }

    return false;
}

namespace {

// Tells whether `pattern_type` is one of the two skip kinds (SKIP_NAME or
// SKIP_NAME_GLOB).
inline bool is_variant_skip_path_pattern_type(PatternTypePB pattern_type) {
    switch (pattern_type) {
    case PatternTypePB::SKIP_NAME:
    case PatternTypePB::SKIP_NAME_GLOB:
        return true;
    default:
        return false;
    }
}

// Replaces the contents of `*skip_path_patterns` with one (name, pattern type)
// pair per sub-column of `column` whose pattern type is a skip kind. The output
// vector is cleared first, so earlier entries never survive the call.
void collect_variant_skip_path_patterns_from_children(
        const TabletColumn& column,
        std::vector<std::pair<std::string, PatternTypePB>>* skip_path_patterns) {
    skip_path_patterns->clear();
    for (const auto& child : column.get_sub_columns()) {
        if (is_variant_skip_path_pattern_type(child->pattern_type())) {
            skip_path_patterns->emplace_back(child->name(), child->pattern_type());
        }
    }
}

// Reports whether at least one sub-column of `column` carries a typed-path
// pattern type, as decided by is_typed_path_pattern_type().
bool has_variant_typed_path_children(const TabletColumn& column) {
    bool found_typed_child = false;
    for (const auto& child : column.get_sub_columns()) {
        if (is_typed_path_pattern_type(child->pattern_type())) {
            found_typed_child = true;
            break;
        }
    }
    return found_typed_child;
}

} // namespace

size_t get_number_of_dimensions(const IDataType& type) {
if (const auto* type_array = typeid_cast<const DataTypeArray*>(&type)) {
return type_array->get_number_of_dimensions();
Expand Down Expand Up @@ -464,10 +601,11 @@ Status check_variant_has_no_ambiguous_paths(const PathsInData& tuple_paths) {
return Status::OK();
}

Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
const std::map<std::string, TabletColumnPtr>& typed_columns,
std::set<PathInData>* path_set) {
Status update_least_schema_internal(
const std::map<PathInData, DataTypes>& subcolumns_types, TabletSchemaSPtr& common_schema,
int32_t variant_col_unique_id,
const std::map<std::string, TabletColumnPtr>& typed_path_columns,
std::set<PathInData>* path_set) {
PathsInData tuple_paths;
DataTypes tuple_types;
CHECK(common_schema.use_count() == 1);
Expand Down Expand Up @@ -503,10 +641,10 @@ Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subco
// Append all common type columns of this variant
for (int i = 0; i < tuple_paths.size(); ++i) {
TabletColumn common_column;
// typed path not contains root part
// typed path does not include root part
auto path_without_root = tuple_paths[i].copy_pop_front().get_path();
if (typed_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) {
common_column = *typed_columns.at(path_without_root);
if (typed_path_columns.contains(path_without_root) && !tuple_paths[i].has_nested_part()) {
common_column = *typed_path_columns.at(path_without_root);
// parent unique id and path may not be initialized on the write path
common_column.set_parent_unique_id(variant_col_unique_id);
common_column.set_path_info(tuple_paths[i]);
Expand All @@ -529,10 +667,13 @@ Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subco
Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
std::set<PathInData>* path_set) {
std::map<std::string, TabletColumnPtr> typed_columns;
std::map<std::string, TabletColumnPtr> typed_path_columns;
for (const TabletColumnPtr& col :
common_schema->column_by_uid(variant_col_unique_id).get_sub_columns()) {
typed_columns[col->name()] = col;
if (!is_typed_path_pattern_type(col->pattern_type())) {
continue;
}
typed_path_columns[col->name()] = col;
}
// Types of subcolumns by path from all tuples.
std::map<PathInData, DataTypes> subcolumns_types;
Expand All @@ -556,7 +697,7 @@ Status update_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
RETURN_IF_ERROR(check_variant_has_no_ambiguous_paths(all_paths));

return update_least_schema_internal(subcolumns_types, common_schema, variant_col_unique_id,
typed_columns, path_set);
typed_path_columns, path_set);
}

// Keep variant subcolumn BF support aligned with FE DDL checks.
Expand Down Expand Up @@ -1216,7 +1357,8 @@ Status VariantCompactionUtil::get_extended_compaction_schema(
uid_to_paths_set_info[column->unique_id()]);

// 4. append subcolumns
if (column->variant_max_subcolumns_count() > 0 || !column->get_sub_columns().empty()) {
if (column->variant_max_subcolumns_count() > 0 ||
has_variant_typed_path_children(*column)) {
get_compaction_subcolumns_from_subpaths(
uid_to_paths_set_info[column->unique_id()], column, target,
uid_to_variant_extended_info[column->unique_id()].path_to_data_types,
Expand Down Expand Up @@ -2039,13 +2181,23 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t
}

std::vector<ParseConfig> configs(variant_column_pos.size());
std::vector<std::vector<std::pair<std::string, PatternTypePB>>> variant_skip_path_patterns(
variant_column_pos.size());
for (size_t i = 0; i < variant_column_pos.size(); ++i) {
configs[i].enable_flatten_nested = tablet_schema.variant_flatten_nested();
const auto& column = tablet_schema.column(variant_column_pos[i]);
if (!column.is_variant_type()) {
return Status::InternalError("column is not variant type, column name: {}",
column.name());
}
// Set skip path patterns if configured on variant children.
collect_variant_skip_path_patterns_from_children(column, &variant_skip_path_patterns[i]);
if (!variant_skip_path_patterns[i].empty()) {
configs[i].skip_path_patterns = &variant_skip_path_patterns[i];
RETURN_IF_ERROR(build_compiled_skip_matcher(variant_skip_path_patterns[i], true,
&configs[i].compiled_skip_matcher));
configs[i].adaptive_skip_result_cache_capacity = true;
}
// if doc mode is not enabled, no need to parse to doc value column
if (!column.variant_enable_doc_mode()) {
configs[i].parse_to = ParseConfig::ParseTo::OnlySubcolumns;
Expand Down
51 changes: 47 additions & 4 deletions be/src/vec/common/variant_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
Expand Down Expand Up @@ -65,12 +66,53 @@ const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__";
const std::string DOC_VALUE_COLUMN_PATH = "__DORIS_VARIANT_DOC_VALUE__";
namespace doris::vectorized::variant_util {

struct CompiledSkipMatcher;

// Tells whether `pattern_type` is one of the MATCH_* kinds (MATCH_NAME or
// MATCH_NAME_GLOB); every other value yields false.
inline bool is_typed_path_pattern_type(PatternTypePB pattern_type) {
    switch (pattern_type) {
    case PatternTypePB::MATCH_NAME:
    case PatternTypePB::MATCH_NAME_GLOB:
        return true;
    default:
        return false;
    }
}

// Tells whether `pattern_type` is SKIP_NAME, the skip kind that is compared by
// exact string equality.
inline bool is_skip_exact_path_pattern_type(PatternTypePB pattern_type) {
    const bool is_exact_skip = (PatternTypePB::SKIP_NAME == pattern_type);
    return is_exact_skip;
}

// Tells whether `pattern_type` is SKIP_NAME_GLOB, the skip kind that is matched
// as a glob.
inline bool is_skip_glob_path_pattern_type(PatternTypePB pattern_type) {
    const bool is_glob_skip = (PatternTypePB::SKIP_NAME_GLOB == pattern_type);
    return is_glob_skip;
}

// Convert a restricted glob pattern into a regex (for tests/internal use).
Status glob_to_regex(const std::string& glob_pattern, std::string* regex_pattern);

// Match a glob pattern against a path using RE2.
bool glob_match_re2(const std::string& glob_pattern, const std::string& candidate_path);

// Build an immutable matcher for skip path patterns used in hot parsing paths.
Status build_compiled_skip_matcher(
const std::vector<std::pair<std::string, PatternTypePB>>& skip_path_patterns,
bool enable_re2_set, std::shared_ptr<const CompiledSkipMatcher>* out);

// Match a dot-separated path against precompiled skip path patterns.
bool should_skip_path(const CompiledSkipMatcher& matcher, std::string_view path);

// Check if a dot-separated path should be skipped based on skip path patterns.
// For SKIP_NAME_GLOB, uses glob matching; for SKIP_NAME, uses exact string comparison.
inline bool should_skip_path(
const std::vector<std::pair<std::string, PatternTypePB>>& skip_path_patterns,
const std::string& path) {
for (const auto& [pattern, pt] : skip_path_patterns) {
if (is_skip_exact_path_pattern_type(pt) && path == pattern) {
return true;
}
}
for (const auto& [pattern, pt] : skip_path_patterns) {
if (is_skip_glob_path_pattern_type(pt) && glob_match_re2(pattern, path)) {
return true;
}
}
return false;
}

using PathToNoneNullValues = std::unordered_map<std::string, int64_t>;
using PathToDataTypes = std::unordered_map<PathInData, std::vector<DataTypePtr>, PathInData::Hash>;

Expand Down Expand Up @@ -166,10 +208,11 @@ bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
bool inherit_index(const std::vector<const TabletIndex*>& parent_indexes,
TabletIndexes& sub_column_indexes, const segment_v2::ColumnMetaPB& column_pb);

Status update_least_schema_internal(const std::map<PathInData, DataTypes>& subcolumns_types,
TabletSchemaSPtr& common_schema, int32_t variant_col_unique_id,
const std::map<std::string, TabletColumnPtr>& typed_columns,
std::set<PathInData>* path_set = nullptr);
Status update_least_schema_internal(
const std::map<PathInData, DataTypes>& subcolumns_types, TabletSchemaSPtr& common_schema,
int32_t variant_col_unique_id,
const std::map<std::string, TabletColumnPtr>& typed_path_columns,
std::set<PathInData>* path_set = nullptr);

bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id,
const std::string& path,
Expand Down
Loading