diff --git a/be/benchmark/benchmark_column_read_order.hpp b/be/benchmark/benchmark_column_read_order.hpp new file mode 100644 index 00000000000000..24cde997c31fa1 --- /dev/null +++ b/be/benchmark/benchmark_column_read_order.hpp @@ -0,0 +1,525 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vec/exec/format/parquet/column_read_order_ctx.h" + +namespace doris::vectorized { + +// ============================================================================ +// P0-2 Benchmark: Predicate Column Read Order Optimization +// +// This benchmark compares three strategies: +// +// 1. AllAtOnce (baseline) +// Read ALL predicate columns fully (all rows), then evaluate filters. +// This is the original _do_lazy_read() path with no P0-1 or P0-2. +// +// 2. PerCol_NoPushdown (P0-2 only, no P0-1) +// Read columns one-by-one with intermediate filtering. However, the +// decoder does NOT receive the filter bitmap — it still decodes ALL +// rows (num_rows). The benefit comes only from being able to skip +// evaluating conjuncts on already-filtered rows and potentially +// short-circuiting. In practice this means: decode cost is the same +// as AllAtOnce per column, but we evaluate filters earlier. +// +// 3. PerCol_WithPushdown (P0-2 + P0-1) +// Read columns one-by-one with intermediate filtering AND filter +// bitmap pushdown. The decoder only decodes surviving rows (via P0-1). +// This is the full optimized path. +// +// For each strategy we test BestOrder and WorstOrder column orderings. +// +// We also include Adaptive (ColumnReadOrderCtx) and overhead benchmarks. +// ============================================================================ + +// ---- Helper: generate a random filter with given selectivity ---- +static std::vector p02_gen_column_filter(int num_rows, double selectivity, unsigned seed) { + std::mt19937 rng(seed); + std::uniform_real_distribution dist(0.0, 1.0); + std::vector filter(num_rows); + for (int i = 0; i < num_rows; ++i) { + filter[i] = dist(rng) < selectivity ? 1 : 0; + } + return filter; +} + +// ---- Helper: combine (AND) two filters ---- +static void p02_combine_filters(std::vector& combined, + const std::vector& col_filter, int num_rows) { + for (int i = 0; i < num_rows; ++i) { + combined[i] &= col_filter[i]; + } +} + +// ---- Helper: count surviving rows ---- +static int p02_count_survivors(const std::vector& filter, int num_rows) { + int count = 0; + for (int i = 0; i < num_rows; ++i) { + count += filter[i]; + } + return count; +} + +// Simulated decode WITH P0-1 pushdown: +// Only touches surviving rows. Cost = survivors * per_row_cost. 
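+// Illustrative arithmetic (hypothetical numbers, not benchmark output): with
+// num_rows = 100,000, per_row_cost = 32 and 10% survivors, this path touches
+// about 10,000 * 32 = 320 KB of scratch, while the no-pushdown decode below
+// always touches 100,000 * 32 = 3.2 MB. That gap is what the WithPushdown
+// benchmarks are meant to expose.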
+static void p02_decode_with_pushdown(const std::vector& surviving_filter, int num_rows, + int per_row_cost, std::vector& scratch) { + if (static_cast(scratch.size()) < num_rows * per_row_cost) { + scratch.resize(num_rows * per_row_cost); + } + int offset = 0; + for (int i = 0; i < num_rows; ++i) { + if (surviving_filter[i]) { + memset(scratch.data() + offset, static_cast(i & 0xFF), per_row_cost); + offset += per_row_cost; + } + } + benchmark::DoNotOptimize(scratch.data()); + benchmark::ClobberMemory(); +} + +// Simulated decode WITHOUT P0-1 pushdown: +// Decodes ALL rows regardless of filter. Cost = num_rows * per_row_cost. +// This is what the decoder does when it doesn't receive filter_data. +static void p02_decode_no_pushdown(int num_rows, int per_row_cost, std::vector& scratch) { + int total = num_rows * per_row_cost; + if (static_cast(scratch.size()) < total) { + scratch.resize(total); + } + memset(scratch.data(), 0x42, total); + benchmark::DoNotOptimize(scratch.data()); + benchmark::ClobberMemory(); +} + +// ---- Column config for simulation ---- +struct P02SimColumn { + int cost; // per-row decode cost in bytes + double selectivity; // fraction of rows passing this column's filter + std::vector filter; // pre-generated filter +}; + +static std::vector p02_build_sim_columns(int num_rows, int num_cols, + const std::vector& costs, + const std::vector& selectivities) { + std::vector cols(num_cols); + for (int i = 0; i < num_cols; ++i) { + cols[i].cost = costs[i]; + cols[i].selectivity = selectivities[i]; + cols[i].filter = p02_gen_column_filter(num_rows, selectivities[i], 1000 + i); + } + return cols; +} + +// ---- Scenario setup helper ---- +static void p02_setup_scenario(int num_cols, int scenario, std::vector& costs, + std::vector& selectivities) { + costs.resize(num_cols); + selectivities.resize(num_cols); + for (int i = 0; i < num_cols; ++i) { + costs[i] = 32; + } + switch (scenario) { + case 0: // skewed: one column 1%, rest 90% + for (int i = 0; i < num_cols; ++i) { + selectivities[i] = (i == num_cols - 1) ? 0.01 : 0.90; + } + break; + case 1: // uniform: all 50% + for (int i = 0; i < num_cols; ++i) { + selectivities[i] = 0.50; + } + break; + case 2: // cascading: 80% -> 20% + for (int i = 0; i < num_cols; ++i) { + selectivities[i] = 0.80 - i * (0.60 / std::max(num_cols - 1, 1)); + if (selectivities[i] < 0.05) selectivities[i] = 0.05; + } + break; + default: + break; + } +} + +static std::string p02_scenario_name(int scenario) { + switch (scenario) { + case 0: + return "skewed"; + case 1: + return "uniform"; + case 2: + return "cascading"; + default: + return "unknown"; + } +} + +// Sort order helpers +static std::vector p02_best_order(const std::vector& cols, int num_cols) { + std::vector order(num_cols); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), + [&](int a, int b) { return cols[a].selectivity < cols[b].selectivity; }); + return order; +} + +static std::vector p02_worst_order(const std::vector& cols, int num_cols) { + std::vector order(num_cols); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), + [&](int a, int b) { return cols[a].selectivity > cols[b].selectivity; }); + return order; +} + +// ============================================================================ +// Benchmark 1: AllAtOnce — Baseline (no P0-1, no P0-2) +// +// Read ALL predicate columns fully (all rows decoded), then filter. 
+// Total decode work = num_cols * num_rows * per_row_cost +// ============================================================================ +static void BM_P02_AllAtOnce(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + std::vector scratch; + + for (auto _ : state) { + // Phase 1: decode ALL columns, ALL rows (no filter pushdown) + for (int c = 0; c < num_cols; ++c) { + p02_decode_no_pushdown(num_rows, columns[c].cost, scratch); + } + // Phase 2: evaluate all filters at once + std::vector combined(num_rows, 1); + for (int c = 0; c < num_cols; ++c) { + p02_combine_filters(combined, columns[c].filter, num_rows); + } + benchmark::DoNotOptimize(combined.data()); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario)); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); +} + +// ============================================================================ +// Benchmark 2: PerCol_NoPushdown — P0-2 only (no P0-1) +// +// Read columns one-by-one, evaluate per-col filter after each. +// BUT decoder still decodes ALL rows (no filter bitmap pushdown). +// Benefit: can skip conjunct evaluation for filtered rows, and if a +// column filters everything, remaining columns don't need to be read. +// Cost: same decode work per column as AllAtOnce. +// ============================================================================ +static void BM_P02_PerCol_NoPushdown_Best(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + auto order = p02_best_order(columns, num_cols); + std::vector scratch; + + for (auto _ : state) { + std::vector combined(num_rows, 1); + for (int idx = 0; idx < num_cols; ++idx) { + int c = order[idx]; + // Decoder decodes ALL rows (no pushdown) + p02_decode_no_pushdown(num_rows, columns[c].cost, scratch); + // Evaluate per-col filter + p02_combine_filters(combined, columns[c].filter, num_rows); + } + benchmark::DoNotOptimize(combined.data()); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario)); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); +} + +static void BM_P02_PerCol_NoPushdown_Worst(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + auto order = p02_worst_order(columns, num_cols); + std::vector scratch; + + for (auto _ : state) { + std::vector combined(num_rows, 1); + for (int idx = 0; idx < num_cols; ++idx) { + int c = order[idx]; + p02_decode_no_pushdown(num_rows, columns[c].cost, scratch); + p02_combine_filters(combined, columns[c].filter, 
num_rows); + } + benchmark::DoNotOptimize(combined.data()); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario)); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); +} + +// ============================================================================ +// Benchmark 3: PerCol_WithPushdown — P0-2 + P0-1 (full optimization) +// +// Read columns one-by-one, evaluate per-col filter after each. +// Decoder receives accumulated filter bitmap and ONLY decodes surviving rows. +// This is the full P0-2 + P0-1 path. +// ============================================================================ +static void BM_P02_PerCol_WithPushdown_Best(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + auto order = p02_best_order(columns, num_cols); + std::vector scratch; + + for (auto _ : state) { + std::vector combined(num_rows, 1); + for (int idx = 0; idx < num_cols; ++idx) { + int c = order[idx]; + // Decoder only decodes surviving rows (P0-1 pushdown) + p02_decode_with_pushdown(combined, num_rows, columns[c].cost, scratch); + // Evaluate per-col filter + p02_combine_filters(combined, columns[c].filter, num_rows); + } + benchmark::DoNotOptimize(combined.data()); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario)); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); +} + +static void BM_P02_PerCol_WithPushdown_Worst(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + auto order = p02_worst_order(columns, num_cols); + std::vector scratch; + + for (auto _ : state) { + std::vector combined(num_rows, 1); + for (int idx = 0; idx < num_cols; ++idx) { + int c = order[idx]; + p02_decode_with_pushdown(combined, num_rows, columns[c].cost, scratch); + p02_combine_filters(combined, columns[c].filter, num_rows); + } + benchmark::DoNotOptimize(combined.data()); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario)); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); +} + +// ============================================================================ +// Benchmark 4: PerCol_WithPushdown_Adaptive — P0-2 + P0-1 with Ctx +// +// Full path with ColumnReadOrderCtx adaptive ordering. +// Runs 20 batches (10 exploration + 10 exploitation). 
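+// As exercised here, the ctx contract is: get_column_read_order() returns the
+// ordering to try for the current batch, and update(round_cost, first_selectivity)
+// feeds back the observed decode cost and the selectivity of the first column so
+// later batches can converge on a cheaper ordering. The 10 + 10 split reflects
+// this benchmark's assumption about the ctx's exploration phase, not a guarantee.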
+// ============================================================================ +static void BM_P02_PerCol_Adaptive(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + int scenario = static_cast(state.range(2)); + + std::vector costs; + std::vector selectivities; + p02_setup_scenario(num_cols, scenario, costs, selectivities); + auto columns = p02_build_sim_columns(num_rows, num_cols, costs, selectivities); + std::vector scratch; + + for (auto _ : state) { + std::vector col_indices(num_cols); + std::iota(col_indices.begin(), col_indices.end(), 0); + std::unordered_map cost_map; + size_t total_cost = 0; + for (int i = 0; i < num_cols; ++i) { + cost_map[i] = columns[i].cost; + total_cost += columns[i].cost; + } + ColumnReadOrderCtx ctx(col_indices, cost_map, total_cost * num_rows); + + for (int batch = 0; batch < 20; ++batch) { + const auto& read_order = ctx.get_column_read_order(); + + std::vector combined(num_rows, 1); + size_t round_cost = 0; + double first_selectivity = 1.0; + + for (size_t idx = 0; idx < read_order.size(); ++idx) { + size_t c = read_order[idx]; + int survivors = p02_count_survivors(combined, num_rows); + round_cost += survivors * columns[c].cost; + // P0-1 pushdown: only decode surviving rows + p02_decode_with_pushdown(combined, num_rows, columns[c].cost, scratch); + p02_combine_filters(combined, columns[c].filter, num_rows); + + if (idx == 0) { + int new_survivors = p02_count_survivors(combined, num_rows); + first_selectivity = + survivors > 0 ? static_cast(new_survivors) / survivors : 0.0; + } + } + + ctx.update(round_cost, first_selectivity); + benchmark::DoNotOptimize(combined.data()); + } + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows) + " " + + p02_scenario_name(scenario) + " 20batches"); + state.SetItemsProcessed(state.iterations() * num_rows * num_cols * 20); +} + +// ============================================================================ +// Benchmark 5: Filter Accumulation (bitwise AND) overhead +// ============================================================================ +static void BM_P02_FilterAccumulation(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + int num_rows = static_cast(state.range(1)) * 1000; + + std::vector> filters(num_cols); + for (int i = 0; i < num_cols; ++i) { + filters[i] = p02_gen_column_filter(num_rows, 0.5, 2000 + i); + } + + for (auto _ : state) { + std::vector combined(num_rows, 1); + for (int c = 0; c < num_cols; ++c) { + p02_combine_filters(combined, filters[c], num_rows); + } + benchmark::DoNotOptimize(combined.data()); + benchmark::ClobberMemory(); + } + + state.SetLabel("cols=" + std::to_string(num_cols) + " rows=" + std::to_string(num_rows)); + state.SetBytesProcessed(state.iterations() * static_cast(num_rows) * num_cols); +} + +// ============================================================================ +// Benchmark 6: ColumnReadOrderCtx overhead +// ============================================================================ +static void BM_P02_CtxOverhead(benchmark::State& state) { + int num_cols = static_cast(state.range(0)); + + for (auto _ : state) { + std::vector col_indices(num_cols); + std::iota(col_indices.begin(), col_indices.end(), 0); + std::unordered_map cost_map; + size_t total_cost = 0; + for (int i = 0; i < num_cols; ++i) { + cost_map[i] = 32; + total_cost += 32; + } + ColumnReadOrderCtx ctx(col_indices, cost_map, total_cost * 4096); + + for (int 
batch = 0; batch < 20; ++batch) { + const auto& order = ctx.get_column_read_order(); + benchmark::DoNotOptimize(order.data()); + size_t fake_cost = 1000 - batch * 30; + double fake_sel = 0.5 - batch * 0.02; + ctx.update(fake_cost, fake_sel); + } + benchmark::ClobberMemory(); + } + + state.SetLabel("cols=" + std::to_string(num_cols)); + state.SetItemsProcessed(state.iterations() * 20); +} + +// ============================================================================ +// Registrations +// ============================================================================ +// Args: (num_cols, num_rows_in_thousands, scenario) +// Scenario: 0=skewed, 1=uniform, 2=cascading + +#define P02_COMMON_ARGS \ + ->Args({4, 100, 0}) \ + ->Args({4, 100, 1}) \ + ->Args({4, 100, 2}) \ + ->Args({8, 100, 0}) \ + ->Args({8, 100, 1}) \ + ->Args({8, 100, 2}) \ + ->Args({2, 100, 0}) \ + ->Unit(benchmark::kMicrosecond) + +// --- Baseline: AllAtOnce (no P0-1, no P0-2) --- +BENCHMARK(BM_P02_AllAtOnce) P02_COMMON_ARGS; + +// --- P0-2 only (no P0-1): PerCol with no decoder pushdown --- +BENCHMARK(BM_P02_PerCol_NoPushdown_Best) P02_COMMON_ARGS; +BENCHMARK(BM_P02_PerCol_NoPushdown_Worst) P02_COMMON_ARGS; + +// --- P0-2 + P0-1: PerCol with decoder pushdown --- +BENCHMARK(BM_P02_PerCol_WithPushdown_Best) P02_COMMON_ARGS; +BENCHMARK(BM_P02_PerCol_WithPushdown_Worst) P02_COMMON_ARGS; + +// --- P0-2 + P0-1 Adaptive --- +BENCHMARK(BM_P02_PerCol_Adaptive) P02_COMMON_ARGS; + +// --- Filter Accumulation overhead --- +BENCHMARK(BM_P02_FilterAccumulation) + ->Args({2, 100}) + ->Args({4, 100}) + ->Args({8, 100}) + ->Args({4, 1000}) + ->Unit(benchmark::kMicrosecond); + +// --- Ctx overhead --- +BENCHMARK(BM_P02_CtxOverhead) + ->Args({2}) + ->Args({4}) + ->Args({8}) + ->Args({16}) + ->Unit(benchmark::kNanosecond); + +} // namespace doris::vectorized diff --git a/be/benchmark/benchmark_lazy_dict_decode.hpp b/be/benchmark/benchmark_lazy_dict_decode.hpp new file mode 100644 index 00000000000000..c8a2aba7400204 --- /dev/null +++ b/be/benchmark/benchmark_lazy_dict_decode.hpp @@ -0,0 +1,478 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/coding.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" +#include "util/slice.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/common/custom_allocator.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/byte_array_dict_decoder.h" +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris::vectorized { + +// ============================================================================ +// P0-3 Benchmark: Lazy Dictionary Decode for Lazy String Columns +// +// This benchmark isolates the P0-3 optimization from P0-1, measuring four +// configurations for lazy string column reading in Phase 2: +// +// 1. Baseline (No P0-1, No P0-3): +// Decode all N rows directly from dict -> ColumnString. +// This is the original path for lazy columns. +// +// 2. P0-1 Only (No P0-3): +// Decode with filter_data pushdown: only surviving rows are decoded +// directly from dict -> ColumnString (via _lazy_decode_string_values). +// +// 3. P0-3 Only (No P0-1): +// Decode all N rows to ColumnInt32 (dict codes), then filter the int32 +// column to keep only survivors, then convert_dict_column_to_string_column +// on the filtered (smaller) ColumnInt32. +// +// 4. P0-3 + P0-1: +// Decode with filter_data pushdown to ColumnInt32 (only surviving rows +// get dict codes), then convert_dict_column_to_string_column on the +// result. No intermediate filtering needed since decoder already skipped. +// +// Key dimensions: dict_size (cache effects), selectivity (filter ratio), +// avg_str_len (string materialization cost). 
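+// Rough work model implied by the four configurations (illustrative, not
+// measured): for N rows and selectivity s, config 1 materializes N strings,
+// config 2 materializes s*N strings, config 3 decodes N dict codes and then
+// materializes s*N strings, and config 4 decodes and materializes only s*N.
+// At s = 5% the string materialization shrinks roughly 20x for configs 2-4.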
+// ============================================================================ + +// ---- Reuse helpers from P0-1 benchmark ---- + +// Build dictionary data buffer for ByteArrayDictDecoder +static std::tuple, int32_t, size_t> p03_build_string_dict( + int dict_size, int avg_str_len) { + std::mt19937 rng(42); + std::vector dict_strings; + dict_strings.reserve(dict_size); + for (int i = 0; i < dict_size; ++i) { + std::string s(avg_str_len, 'a'); + for (int j = 0; j < avg_str_len; ++j) { + s[j] = 'a' + (rng() % 26); + } + std::string suffix = "_" + std::to_string(i); + if (static_cast(suffix.size()) < avg_str_len) { + s = s.substr(0, avg_str_len - suffix.size()) + suffix; + } + dict_strings.push_back(s); + } + + size_t total_size = 0; + for (auto& s : dict_strings) { + total_size += 4 + s.size(); + } + + auto dict_data = make_unique_buffer(total_size); + size_t offset = 0; + for (auto& s : dict_strings) { + auto len = static_cast(s.size()); + encode_fixed32_le(dict_data.get() + offset, len); + offset += 4; + memcpy(dict_data.get() + offset, s.data(), len); + offset += len; + } + + return {std::move(dict_data), static_cast(total_size), static_cast(dict_size)}; +} + +// Build RLE-encoded dict index data +static std::vector p03_build_rle_dict_indexes(int num_values, int dict_size, + unsigned seed = 123) { + std::mt19937 rng(seed); + int bit_width = 0; + int tmp = dict_size - 1; + while (tmp > 0) { + bit_width++; + tmp >>= 1; + } + if (bit_width == 0) bit_width = 1; + + faststring buffer; + RleEncoder encoder(&buffer, bit_width); + for (int i = 0; i < num_values; ++i) { + encoder.Put(rng() % dict_size); + } + encoder.Flush(); + + std::vector result; + result.reserve(1 + buffer.size()); + result.push_back(static_cast(bit_width)); + result.insert(result.end(), buffer.data(), buffer.data() + buffer.size()); + return result; +} + +// Build run_length_null_map (no nulls) +static std::vector p03_build_run_length_null_map(int num_values) { + std::vector result; + int remaining = num_values; + while (remaining > 0) { + uint16_t chunk = static_cast(std::min(remaining, 65535)); + if (!result.empty()) { + result.push_back(0); + } + result.push_back(chunk); + remaining -= chunk; + } + return result; +} + +// Build filter bitmap with given selectivity +static std::vector p03_build_filter_bitmap(int num_values, double selectivity, + unsigned seed = 456) { + std::mt19937 rng(seed); + std::vector filter(num_values); + std::uniform_real_distribution dist(0.0, 1.0); + for (int i = 0; i < num_values; ++i) { + filter[i] = dist(rng) < selectivity ? 1 : 0; + } + return filter; +} + +// Helper: filter a ColumnInt32 by bitmap, keeping only rows where filter[i]==1 +static MutableColumnPtr p03_filter_int32_column(const ColumnInt32* src, + const std::vector& filter_bitmap) { + auto result = ColumnInt32::create(); + const auto& data = src->get_data(); + for (size_t i = 0; i < data.size(); ++i) { + if (filter_bitmap[i]) { + result->insert_value(data[i]); + } + } + return result; +} + +// ============================================================================ +// Group 1: Baseline — No P0-1, No P0-3 +// +// Decode all rows dict -> ColumnString directly. 
+// decode_values(ColumnString, is_dict_filter=false, filter_data=nullptr) +// ============================================================================ +static void BM_P03_Baseline(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = static_cast(state.range(3)); + + auto [dict_buf, dict_len, dict_count] = p03_build_string_dict(dict_size, avg_str_len); + auto rle_data = p03_build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = p03_build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = p03_build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Decode ALL rows to ColumnString (no P0-1, no P0-3) + static_cast(decoder.decode_values(column, data_type, select_vector, false, nullptr)); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values) + + " strlen=" + std::to_string(avg_str_len)); +} + +// ============================================================================ +// Group 2: P0-1 Only — filter bitmap pushdown, decode to ColumnString +// +// decode_values(ColumnString, is_dict_filter=false, filter_data=bitmap) +// Only surviving rows are decoded via _lazy_decode_string_values. 
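+// Note that the benchmark passes the same bitmap both to the FilterMap used by
+// ColumnSelectVector and as the filter_data argument, so the decoder and the
+// select vector agree on exactly which rows survive.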
+// ============================================================================ +static void BM_P03_P01Only(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = static_cast(state.range(3)); + + auto [dict_buf, dict_len, dict_count] = p03_build_string_dict(dict_size, avg_str_len); + auto rle_data = p03_build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = p03_build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = p03_build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Decode with P0-1 pushdown: only surviving rows get string materialized + static_cast(decoder.decode_values(column, data_type, select_vector, false, + filter_bitmap.data())); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values) + + " strlen=" + std::to_string(avg_str_len)); +} + +// ============================================================================ +// Group 3: P0-3 Only — decode all rows to int32, filter, then convert survivors +// +// decode_values(ColumnInt32, is_dict_filter=true, filter_data=nullptr) +// Then filter ColumnInt32 by bitmap, then convert_dict_column_to_string_column. 
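+// The intermediate p03_filter_int32_column step models the Phase 2 filtering a
+// reader without bitmap pushdown would still have to do; config 4 skips it
+// because the decoder has already dropped the non-surviving rows.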
+// ============================================================================ +static void BM_P03_P03Only(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = static_cast(state.range(3)); + + auto [dict_buf, dict_len, dict_count] = p03_build_string_dict(dict_size, avg_str_len); + auto rle_data = p03_build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = p03_build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = p03_build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Step 1: Decode ALL rows to ColumnInt32 dict codes (no P0-1 pushdown) + static_cast(decoder.decode_values(column, data_type, select_vector, true, nullptr)); + + // Step 2: Filter the int32 column (simulate Phase 2 filtering) + const auto* int32_col = assert_cast(column.get()); + auto filtered_col = p03_filter_int32_column(int32_col, filter_bitmap); + + // Step 3: Convert surviving dict codes to strings + const auto* filtered_int32 = assert_cast(filtered_col.get()); + auto string_col = decoder.convert_dict_column_to_string_column(filtered_int32); + + benchmark::DoNotOptimize(string_col); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values) + + " strlen=" + std::to_string(avg_str_len)); +} + +// ============================================================================ +// Group 4: P0-3 + P0-1 — decode surviving rows to int32, then convert +// +// decode_values(ColumnInt32, is_dict_filter=true, filter_data=bitmap) +// Decoder skips filtered rows (P0-1). Output is already filtered int32 codes. +// Then convert_dict_column_to_string_column on the (small) result. 
+// ============================================================================ +static void BM_P03_P03PlusP01(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = static_cast(state.range(3)); + + auto [dict_buf, dict_len, dict_count] = p03_build_string_dict(dict_size, avg_str_len); + auto rle_data = p03_build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = p03_build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = p03_build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Step 1: Decode with P0-1 pushdown to ColumnInt32 (only survivors) + static_cast(decoder.decode_values(column, data_type, select_vector, true, + filter_bitmap.data())); + + // Step 2: Convert surviving dict codes to strings (column already filtered) + const auto* int32_col = assert_cast(column.get()); + auto string_col = decoder.convert_dict_column_to_string_column(int32_col); + + benchmark::DoNotOptimize(string_col); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values) + + " strlen=" + std::to_string(avg_str_len)); +} + +// ============================================================================ +// Group 5: Convert overhead — just the dict code -> string conversion +// +// Measure convert_dict_column_to_string_column in isolation for N rows. 
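+// Comparing this against Groups 3 and 4 separates the dict-code -> string
+// conversion cost from the RLE index decode cost; dict_size matters here mainly
+// through dictionary cache residency (an assumption of this setup, not a
+// measured claim).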
+// ============================================================================ +static void BM_P03_ConvertOverhead(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + int num_values = static_cast(state.range(1)) * 1000; + int avg_str_len = static_cast(state.range(2)); + + auto [dict_buf, dict_len, dict_count] = p03_build_string_dict(dict_size, avg_str_len); + + // Build a ColumnInt32 with random dict codes + std::mt19937 rng(789); + auto int32_col = ColumnInt32::create(); + for (int i = 0; i < num_values; ++i) { + int32_col->insert_value(rng() % dict_size); + } + + // We need a decoder with dict loaded for convert_dict_column_to_string_column + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + + const ColumnInt32* raw_ptr = int32_col.get(); + + for (auto _ : state) { + auto string_col = decoder.convert_dict_column_to_string_column(raw_ptr); + benchmark::DoNotOptimize(string_col); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " rows=" + std::to_string(num_values) + + " strlen=" + std::to_string(avg_str_len)); +} + +// ============================================================================ +// Registrations +// ============================================================================ +// Args: (dict_size, selectivity_percent, num_values_in_thousands, avg_str_len) + +// Core comparison: small dict (100), various selectivities, 100K rows +// String lengths: 32 (short) and 128 (medium-long) +#define P03_CORE_ARGS \ + ->Args({100, 5, 100, 32}) \ + ->Args({100, 10, 100, 32}) \ + ->Args({100, 20, 100, 32}) \ + ->Args({100, 50, 100, 32}) \ + ->Args({100, 100, 100, 32}) \ + ->Args({100, 5, 100, 128}) \ + ->Args({100, 20, 100, 128}) \ + ->Args({100, 50, 100, 128}) \ + ->Args({100, 100, 100, 128}) \ + ->Args({10000, 5, 100, 32}) \ + ->Args({10000, 20, 100, 32}) \ + ->Args({10000, 50, 100, 32}) \ + ->Args({10000, 5, 100, 128}) \ + ->Args({10000, 20, 100, 128}) \ + ->Unit(benchmark::kMicrosecond) + +// --- Group 1: Baseline --- +BENCHMARK(BM_P03_Baseline) P03_CORE_ARGS; + +// --- Group 2: P0-1 Only --- +BENCHMARK(BM_P03_P01Only) P03_CORE_ARGS; + +// --- Group 3: P0-3 Only --- +BENCHMARK(BM_P03_P03Only) P03_CORE_ARGS; + +// --- Group 4: P0-3 + P0-1 --- +BENCHMARK(BM_P03_P03PlusP01) P03_CORE_ARGS; + +// --- Group 5: Convert overhead --- +BENCHMARK(BM_P03_ConvertOverhead) + ->Args({100, 5, 32}) + ->Args({100, 50, 32}) + ->Args({100, 100, 32}) + ->Args({100, 5, 128}) + ->Args({100, 100, 128}) + ->Args({10000, 5, 32}) + ->Args({10000, 100, 32}) + ->Args({10000, 5, 128}) + ->Args({10000, 100, 128}) + ->Unit(benchmark::kMicrosecond); + +} // namespace doris::vectorized diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index 9c9aede58d1793..46f43af65aa79a 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -20,8 +20,12 @@ #include "benchmark_bit_pack.hpp" #include "benchmark_bits.hpp" #include "benchmark_block_bloom_filter.hpp" +#include "benchmark_column_read_order.hpp" #include "benchmark_fastunion.hpp" #include "benchmark_hll_merge.hpp" +#include "benchmark_lazy_dict_decode.hpp" +#include "benchmark_p1_decoder_opts.hpp" +#include "benchmark_parquet_dict_decoder.hpp" #include "benchmark_string.hpp" #include "binary_cast_benchmark.hpp" #include 
"vec/columns/column_string.h" diff --git a/be/benchmark/benchmark_p1_decoder_opts.hpp b/be/benchmark/benchmark_p1_decoder_opts.hpp new file mode 100644 index 00000000000000..a02731d2a8bbaf --- /dev/null +++ b/be/benchmark/benchmark_p1_decoder_opts.hpp @@ -0,0 +1,531 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "util/coding.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" +#include "util/slice.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/common/custom_allocator.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/byte_array_dict_decoder.h" +#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp" +#include "vec/exec/format/parquet/fix_length_plain_decoder.h" +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris::vectorized { + +// ============================================================================ +// P1-4/5/6 Benchmark: Independent + Combined Test Groups +// +// Test Matrix (for dict decoders): +// Group A (Baseline): SIMD=off, Prefetch=off — pure scalar loop +// Group B (P1-4 Only): SIMD=on, Prefetch=off — SIMD gather, no prefetch +// Group C (P1-5 Only): SIMD=off, Prefetch=on — scalar loop + sw prefetch +// Group D (P1-4+P1-5): SIMD=on, Prefetch=on — full optimized path +// +// For each group: INT32 / INT64 / String × dict={100, 10K, 1M} × rows={100K, 500K} +// +// Group E: P1-6 Plain Fast Path (independent, no config interaction) +// No-null memcpy fast path vs with-nulls run-loop × INT32/INT64 × rows={100K, 500K, 1M} +// ============================================================================ + +// ---- Helpers ---- + +static std::tuple, int32_t, size_t> p1_build_int32_dict( + int dict_size) { + auto dict_data = make_unique_buffer(dict_size * sizeof(int32_t)); + auto* ptr = reinterpret_cast(dict_data.get()); + for (int i = 0; i < dict_size; ++i) { + ptr[i] = i * 7 + 13; + } + return {std::move(dict_data), static_cast(dict_size * sizeof(int32_t)), + static_cast(dict_size)}; +} + +static std::tuple, int32_t, size_t> p1_build_int64_dict( + int dict_size) { + auto dict_data = make_unique_buffer(dict_size * sizeof(int64_t)); + auto* ptr = reinterpret_cast(dict_data.get()); + for (int i = 0; i < dict_size; ++i) { + ptr[i] = static_cast(i) * 17 + 42; + } + return {std::move(dict_data), static_cast(dict_size * sizeof(int64_t)), + static_cast(dict_size)}; +} + +static std::tuple, int32_t, size_t> p1_build_string_dict( + int dict_size, int avg_str_len) { + std::mt19937 rng(42); + std::vector dict_strings; + 
dict_strings.reserve(dict_size); + for (int i = 0; i < dict_size; ++i) { + std::string s(avg_str_len, 'a'); + for (int j = 0; j < avg_str_len; ++j) { + s[j] = 'a' + (rng() % 26); + } + std::string suffix = "_" + std::to_string(i); + s = s.substr(0, avg_str_len - suffix.size()) + suffix; + dict_strings.push_back(s); + } + + size_t total_size = 0; + for (auto& s : dict_strings) { + total_size += 4 + s.size(); + } + + auto dict_data = make_unique_buffer(total_size); + size_t offset = 0; + for (auto& s : dict_strings) { + auto len = static_cast(s.size()); + encode_fixed32_le(dict_data.get() + offset, len); + offset += 4; + memcpy(dict_data.get() + offset, s.data(), len); + offset += len; + } + + return {std::move(dict_data), static_cast(total_size), static_cast(dict_size)}; +} + +static std::vector p1_build_rle_dict_indexes(int num_values, int dict_size, + unsigned seed = 123) { + std::mt19937 rng(seed); + int bit_width = 0; + int tmp = dict_size - 1; + while (tmp > 0) { + bit_width++; + tmp >>= 1; + } + if (bit_width == 0) bit_width = 1; + + faststring buffer; + RleEncoder encoder(&buffer, bit_width); + for (int i = 0; i < num_values; ++i) { + encoder.Put(rng() % dict_size); + } + encoder.Flush(); + + std::vector result; + result.reserve(1 + buffer.size()); + result.push_back(static_cast(bit_width)); + result.insert(result.end(), buffer.data(), buffer.data() + buffer.size()); + return result; +} + +static std::vector p1_build_run_length_null_map(int num_values) { + std::vector result; + int remaining = num_values; + while (remaining > 0) { + uint16_t chunk = static_cast(std::min(remaining, 65535)); + if (!result.empty()) { + result.push_back(0); + } + result.push_back(chunk); + remaining -= chunk; + } + return result; +} + +// ---- RAII config guard ---- +struct ConfigGuard { + bool saved_simd; + bool saved_prefetch; + ConfigGuard(bool simd, bool prefetch) { + saved_simd = config::enable_parquet_simd_dict_decode; + saved_prefetch = config::enable_parquet_dict_prefetch; + config::enable_parquet_simd_dict_decode = simd; + config::enable_parquet_dict_prefetch = prefetch; + } + ~ConfigGuard() { + config::enable_parquet_simd_dict_decode = saved_simd; + config::enable_parquet_dict_prefetch = saved_prefetch; + } +}; + +// ============================================================================ +// Parameterized INT32 Dict Decode Benchmark +// Args: (dict_size, num_values_k) +// The config mode is set before calling. 
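+// ConfigGuard (defined above) saves the two global decode knobs, applies the
+// requested SIMD/prefetch combination for the duration of one benchmark, and
+// restores the saved values on scope exit, so groups A-D can run in any order
+// without leaking configuration state.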
+// ============================================================================ + +static void BM_INT32_DictDecode(benchmark::State& state, bool simd, bool prefetch, + const std::string& label_prefix) { + int dict_size = static_cast(state.range(0)); + int num_values = static_cast(state.range(1)) * 1000; + + auto [dict_buf, dict_len, dict_count] = p1_build_int32_dict(dict_size); + auto rle_data = p1_build_rle_dict_indexes(num_values, dict_size); + + ConfigGuard guard(simd, prefetch); + + for (auto _ : state) { + state.PauseTiming(); + FixLengthDictDecoder decoder; + decoder.set_type_length(sizeof(int32_t)); + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + auto run_length_null_map = p1_build_run_length_null_map(num_values); + FilterMap filter_map; + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel(label_prefix + " dict=" + std::to_string(dict_size) + + " rows=" + std::to_string(num_values)); +} + +// ============================================================================ +// Parameterized INT64 Dict Decode Benchmark +// ============================================================================ + +static void BM_INT64_DictDecode(benchmark::State& state, bool simd, bool prefetch, + const std::string& label_prefix) { + int dict_size = static_cast(state.range(0)); + int num_values = static_cast(state.range(1)) * 1000; + + auto [dict_buf, dict_len, dict_count] = p1_build_int64_dict(dict_size); + auto rle_data = p1_build_rle_dict_indexes(num_values, dict_size); + + ConfigGuard guard(simd, prefetch); + + for (auto _ : state) { + state.PauseTiming(); + FixLengthDictDecoder decoder; + decoder.set_type_length(sizeof(int64_t)); + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + MutableColumnPtr column = ColumnInt64::create(); + DataTypePtr data_type = std::make_shared(); + auto run_length_null_map = p1_build_run_length_null_map(num_values); + FilterMap filter_map; + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel(label_prefix + " dict=" + std::to_string(dict_size) + + " rows=" + std::to_string(num_values)); +} + +// ============================================================================ +// Parameterized String Dict Decode Benchmark +// ============================================================================ + +static void 
BM_String_DictDecode(benchmark::State& state, bool simd, bool prefetch, + const std::string& label_prefix) { + int dict_size = static_cast(state.range(0)); + int num_values = static_cast(state.range(1)) * 1000; + int avg_str_len = 32; + + auto [dict_buf, dict_len, dict_count] = p1_build_string_dict(dict_size, avg_str_len); + auto rle_data = p1_build_rle_dict_indexes(num_values, dict_size); + + ConfigGuard guard(simd, prefetch); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + auto run_length_null_map = p1_build_run_length_null_map(num_values); + FilterMap filter_map; + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel(label_prefix + " dict=" + std::to_string(dict_size) + + " rows=" + std::to_string(num_values)); +} + +// ============================================================================ +// Group A: Baseline (SIMD=off, Prefetch=off) +// ============================================================================ + +static void BM_GroupA_INT32_Baseline(benchmark::State& state) { + BM_INT32_DictDecode(state, false, false, "A:Baseline"); +} +static void BM_GroupA_INT64_Baseline(benchmark::State& state) { + BM_INT64_DictDecode(state, false, false, "A:Baseline"); +} +static void BM_GroupA_String_Baseline(benchmark::State& state) { + BM_String_DictDecode(state, false, false, "A:Baseline"); +} + +// ============================================================================ +// Group B: P1-4 Only (SIMD=on, Prefetch=off) +// ============================================================================ + +static void BM_GroupB_INT32_SIMD(benchmark::State& state) { + BM_INT32_DictDecode(state, true, false, "B:SIMD"); +} +static void BM_GroupB_INT64_SIMD(benchmark::State& state) { + BM_INT64_DictDecode(state, true, false, "B:SIMD"); +} +static void BM_GroupB_String_SIMD(benchmark::State& state) { + BM_String_DictDecode(state, true, false, "B:SIMD"); +} + +// ============================================================================ +// Group C: P1-5 Only (SIMD=off, Prefetch=on) +// ============================================================================ + +static void BM_GroupC_INT32_Prefetch(benchmark::State& state) { + BM_INT32_DictDecode(state, false, true, "C:Prefetch"); +} +static void BM_GroupC_INT64_Prefetch(benchmark::State& state) { + BM_INT64_DictDecode(state, false, true, "C:Prefetch"); +} +static void BM_GroupC_String_Prefetch(benchmark::State& state) { + BM_String_DictDecode(state, false, true, "C:Prefetch"); +} + +// ============================================================================ +// Group D: P1-4+P1-5 Combined (SIMD=on, Prefetch=on) +// ============================================================================ + +static void BM_GroupD_INT32_SIMD_Prefetch(benchmark::State& state) { + 
BM_INT32_DictDecode(state, true, true, "D:SIMD+PF"); +} +static void BM_GroupD_INT64_SIMD_Prefetch(benchmark::State& state) { + BM_INT64_DictDecode(state, true, true, "D:SIMD+PF"); +} +static void BM_GroupD_String_SIMD_Prefetch(benchmark::State& state) { + BM_String_DictDecode(state, true, true, "D:SIMD+PF"); +} + +// ============================================================================ +// Group E: P1-6 Plain Fast Path (Independent) +// ============================================================================ + +static void BM_GroupE_PlainFastPath(benchmark::State& state) { + int num_values = static_cast(state.range(0)) * 1000; + int type_length = static_cast(state.range(1)); + + std::mt19937 rng(789); + size_t total_bytes = static_cast(num_values) * type_length; + std::vector plain_data(total_bytes); + for (size_t i = 0; i < total_bytes; ++i) { + plain_data[i] = static_cast(rng() % 256); + } + + for (auto _ : state) { + state.PauseTiming(); + FixLengthPlainDecoder decoder; + decoder.set_type_length(type_length); + Slice data_slice(plain_data.data(), plain_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + if (type_length == 8) { + column = ColumnInt64::create(); + data_type = std::make_shared(); + } + + auto run_length_null_map = p1_build_run_length_null_map(num_values); + FilterMap filter_map; + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("E:FastPath rows=" + std::to_string(num_values) + + " type_len=" + std::to_string(type_length)); +} + +static void BM_GroupE_PlainWithNulls(benchmark::State& state) { + int num_values = static_cast(state.range(0)) * 1000; + int type_length = static_cast(state.range(1)); + + std::mt19937 rng(789); + size_t total_bytes = static_cast(num_values) * type_length; + std::vector plain_data(total_bytes); + for (size_t i = 0; i < total_bytes; ++i) { + plain_data[i] = static_cast(rng() % 256); + } + + // Build null map with ~10% nulls + std::vector null_map; + std::mt19937 null_rng(456); + int remaining = num_values; + bool is_content = true; + while (remaining > 0) { + int run; + if (is_content) { + run = std::min(remaining, static_cast(null_rng() % 50 + 5)); + } else { + run = std::min(remaining, static_cast(null_rng() % 5 + 1)); + } + null_map.push_back(static_cast(run)); + remaining -= run; + is_content = !is_content; + } + + for (auto _ : state) { + state.PauseTiming(); + FixLengthPlainDecoder decoder; + decoder.set_type_length(type_length); + Slice data_slice(plain_data.data(), plain_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + if (type_length == 8) { + column = ColumnInt64::create(); + data_type = std::make_shared(); + } + + FilterMap null_filter_map; + ColumnSelectVector select_vector; + static_cast(select_vector.init(null_map, num_values, nullptr, &null_filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + 
state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("E:WithNulls rows=" + std::to_string(num_values) + + " type_len=" + std::to_string(type_length) + " nulls=10%"); +} + +// ============================================================================ +// Benchmark Registrations +// ============================================================================ + +// Standard args for dict decoders: (dict_size, num_values_k) +// dict_size: 100 (L1), 10000 (L2), 1000000 (>L2) +// rows_k: 100, 500 + +#define DICT_BENCH_ARGS \ + ->Args({100, 100}) \ + ->Args({100, 500}) \ + ->Args({10000, 100}) \ + ->Args({10000, 500}) \ + ->Args({1000000, 100}) \ + ->Args({1000000, 500}) \ + ->Unit(benchmark::kMicrosecond) + +// ============================================= +// INT32 (all 4 groups in sequence for easy comparison) +// ============================================= + +BENCHMARK(BM_GroupA_INT32_Baseline) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupB_INT32_SIMD) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupC_INT32_Prefetch) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupD_INT32_SIMD_Prefetch) DICT_BENCH_ARGS; + +// ============================================= +// INT64 (all 4 groups) +// ============================================= + +BENCHMARK(BM_GroupA_INT64_Baseline) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupB_INT64_SIMD) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupC_INT64_Prefetch) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupD_INT64_SIMD_Prefetch) DICT_BENCH_ARGS; + +// ============================================= +// String (all 4 groups) +// ============================================= + +BENCHMARK(BM_GroupA_String_Baseline) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupB_String_SIMD) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupC_String_Prefetch) DICT_BENCH_ARGS; +BENCHMARK(BM_GroupD_String_SIMD_Prefetch) DICT_BENCH_ARGS; + +#undef DICT_BENCH_ARGS + +// ============================================= +// P1-6 Plain Fast Path (Group E) +// ============================================= + +BENCHMARK(BM_GroupE_PlainFastPath) + ->Args({100, 4}) + ->Args({500, 4}) + ->Args({1000, 4}) + ->Args({100, 8}) + ->Args({500, 8}) + ->Args({1000, 8}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_GroupE_PlainWithNulls) + ->Args({100, 4}) + ->Args({500, 4}) + ->Args({1000, 4}) + ->Args({100, 8}) + ->Args({500, 8}) + ->Args({1000, 8}) + ->Unit(benchmark::kMicrosecond); + +} // namespace doris::vectorized diff --git a/be/benchmark/benchmark_parquet_dict_decoder.hpp b/be/benchmark/benchmark_parquet_dict_decoder.hpp new file mode 100644 index 00000000000000..94a4f4b92f4d82 --- /dev/null +++ b/be/benchmark/benchmark_parquet_dict_decoder.hpp @@ -0,0 +1,498 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
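+// NOTE: the other new benchmark headers in this directory use #pragma once;
+// assuming the omission here was unintentional, the same guard is added for
+// consistency.
+#pragma once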
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/coding.h" +#include "util/faststring.h" +#include "util/rle_encoding.h" +#include "util/slice.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/common/custom_allocator.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/byte_array_dict_decoder.h" +#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp" +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris::vectorized { + +// ============================================================================ +// Helper: Build dictionary data buffer for ByteArrayDictDecoder +// ============================================================================ +// Returns (dict_buffer, total_size, num_entries) +static std::tuple, int32_t, size_t> build_string_dict( + int dict_size, int avg_str_len) { + // Generate deterministic dictionary strings + std::mt19937 rng(42); + std::vector dict_strings; + dict_strings.reserve(dict_size); + for (int i = 0; i < dict_size; ++i) { + // Create a string of avg_str_len with random content + std::string s(avg_str_len, 'a'); + for (int j = 0; j < avg_str_len; ++j) { + s[j] = 'a' + (rng() % 26); + } + // Append index to ensure uniqueness + std::string suffix = "_" + std::to_string(i); + s = s.substr(0, avg_str_len - suffix.size()) + suffix; + dict_strings.push_back(s); + } + + // Calculate total dict data size (4-byte length prefix + string data) + size_t total_size = 0; + for (auto& s : dict_strings) { + total_size += 4 + s.size(); + } + + auto dict_data = make_unique_buffer(total_size); + size_t offset = 0; + for (auto& s : dict_strings) { + auto len = static_cast(s.size()); + encode_fixed32_le(dict_data.get() + offset, len); + offset += 4; + memcpy(dict_data.get() + offset, s.data(), len); + offset += len; + } + + return {std::move(dict_data), static_cast(total_size), static_cast(dict_size)}; +} + +// ============================================================================ +// Helper: Build dictionary data buffer for FixLengthDictDecoder +// ============================================================================ +static std::tuple, int32_t, size_t> build_int32_dict(int dict_size) { + auto dict_data = make_unique_buffer(dict_size * sizeof(int32_t)); + auto* ptr = reinterpret_cast(dict_data.get()); + for (int i = 0; i < dict_size; ++i) { + ptr[i] = i * 7 + 13; // Arbitrary distinct values + } + return {std::move(dict_data), static_cast(dict_size * sizeof(int32_t)), + static_cast(dict_size)}; +} + +// ============================================================================ +// Helper: Build RLE-encoded dict index data +// ============================================================================ +// Generates RLE-encoded data for num_values dict indexes in [0, dict_size). +// The first byte is the bit_width, followed by the RLE-encoded data. +// Returns a vector that can be used as the data slice. 
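+// For example, with dict_size = 1000 the computed bit_width is 10, so the returned
+// buffer is laid out as [0x0A][RLE / bit-packed runs of 10-bit indexes ...]; the
+// benchmarks below wrap it in a Slice and feed it to the decoder via set_data().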
+static std::vector<uint8_t> build_rle_dict_indexes(int num_values, int dict_size,
+                                                   unsigned seed = 123) {
+    std::mt19937 rng(seed);
+    int bit_width = 0;
+    int tmp = dict_size - 1;
+    while (tmp > 0) {
+        bit_width++;
+        tmp >>= 1;
+    }
+    if (bit_width == 0) bit_width = 1;
+
+    // Use RleEncoder to generate proper RLE data
+    faststring buffer;
+    RleEncoder<int32_t> encoder(&buffer, bit_width);
+    for (int i = 0; i < num_values; ++i) {
+        encoder.Put(rng() % dict_size);
+    }
+    encoder.Flush();
+
+    // Build the final data: [bit_width_byte] [rle_data...]
+    std::vector<uint8_t> result;
+    result.reserve(1 + buffer.size());
+    result.push_back(static_cast<uint8_t>(bit_width));
+    result.insert(result.end(), buffer.data(), buffer.data() + buffer.size());
+    return result;
+}
+
+// ============================================================================
+// Helper: Build run_length_null_map for ColumnSelectVector
+// ============================================================================
+// The map uses uint16_t entries in alternating pattern: [content, null, content, null, ...]
+// Since uint16_t max is 65535, we need to split large num_values into multiple chunks.
+// For benchmarks we have no nulls, so we use [chunk, 0, chunk, 0, ...] pattern.
+static std::vector<uint16_t> build_run_length_null_map(int num_values) {
+    std::vector<uint16_t> result;
+    int remaining = num_values;
+    while (remaining > 0) {
+        uint16_t chunk = static_cast<uint16_t>(std::min(remaining, 65535));
+        if (!result.empty()) {
+            // Need a 0-length null entry before the next content entry
+            result.push_back(0);
+        }
+        result.push_back(chunk);
+        remaining -= chunk;
+    }
+    return result;
+}
+
+// ============================================================================
+// Helper: Build filter bitmap with given selectivity
+// ============================================================================
+static std::vector<uint8_t> build_filter_bitmap(int num_values, double selectivity,
+                                                unsigned seed = 456) {
+    std::mt19937 rng(seed);
+    std::vector<uint8_t> filter(num_values);
+    std::uniform_real_distribution<double> dist(0.0, 1.0);
+    for (int i = 0; i < num_values; ++i) {
+        filter[i] = dist(rng) < selectivity ? 1 : 0;
+    }
+    return filter;
+}
+
+// ============================================================================
+// ByteArrayDictDecoder Benchmark: No Filter vs With Filter
+// ============================================================================
+// Args: (dict_size, selectivity_percent, num_values_k)
+// selectivity_percent: e.g.
5 means 5% rows survive +// num_values_k: number of values in thousands + +static void BM_ByteArrayDictDecode_NoFilter(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = 32; + + // Setup decoder and dict + auto [dict_buf, dict_len, dict_count] = build_string_dict(dict_size, avg_str_len); + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + + // Build filter map (selectivity-based) + auto filter_bitmap = build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Init ColumnSelectVector with filter + auto run_length_null_map = build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Decode WITHOUT filter_data pushdown (original path) + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values)); +} + +static void BM_ByteArrayDictDecode_WithFilter(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + int avg_str_len = 32; + + // Setup decoder and dict + auto [dict_buf, dict_len, dict_count] = build_string_dict(dict_size, avg_str_len); + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + + // Build filter map (selectivity-based) + auto filter_bitmap = build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + ByteArrayDictDecoder decoder; + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Init ColumnSelectVector with filter + auto run_length_null_map = build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + // Decode WITH filter_data pushdown (optimized path) + auto status = decoder.decode_values(column, data_type, select_vector, false, + filter_bitmap.data()); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + 
state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values)); +} + +// ============================================================================ +// FixLengthDictDecoder Benchmark: No Filter vs With Filter +// ============================================================================ + +static void BM_FixLenDictDecode_NoFilter(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + + auto [dict_buf, dict_len, dict_count] = build_int32_dict(dict_size); + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + FixLengthDictDecoder decoder; + decoder.set_type_length(sizeof(int32_t)); + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, nullptr); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values)); +} + +static void BM_FixLenDictDecode_WithFilter(benchmark::State& state) { + int dict_size = static_cast(state.range(0)); + double selectivity = state.range(1) / 100.0; + int num_values = static_cast(state.range(2)) * 1000; + + auto [dict_buf, dict_len, dict_count] = build_int32_dict(dict_size); + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + auto filter_bitmap = build_filter_bitmap(num_values, selectivity); + + for (auto _ : state) { + state.PauseTiming(); + FixLengthDictDecoder decoder; + decoder.set_type_length(sizeof(int32_t)); + { + auto dict_copy = make_unique_buffer(dict_len); + memcpy(dict_copy.get(), dict_buf.get(), dict_len); + static_cast(decoder.set_dict(dict_copy, dict_len, dict_count)); + } + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + static_cast(decoder.set_data(&data_slice)); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + auto run_length_null_map = build_run_length_null_map(num_values); + FilterMap filter_map; + static_cast(filter_map.init(filter_bitmap.data(), filter_bitmap.size(), false)); + ColumnSelectVector select_vector; + static_cast( + select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0)); + state.ResumeTiming(); + + auto status = decoder.decode_values(column, data_type, select_vector, false, + filter_bitmap.data()); + benchmark::DoNotOptimize(column); + benchmark::ClobberMemory(); + } + 
state.SetItemsProcessed(state.iterations() * num_values); + state.SetLabel("dict=" + std::to_string(dict_size) + " sel=" + std::to_string(state.range(1)) + + "%" + " rows=" + std::to_string(num_values)); +} + +// ============================================================================ +// RleBatchDecoder SkipBatch Benchmark: Old (GetBatch+discard) vs New (SkipBatch) +// ============================================================================ + +static void BM_RleSkip_GetBatch(benchmark::State& state) { + int num_values = static_cast(state.range(0)) * 1000; + int dict_size = 1000; + + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + uint8_t bit_width = rle_data[0]; + + for (auto _ : state) { + state.PauseTiming(); + RleBatchDecoder decoder(rle_data.data() + 1, + static_cast(rle_data.size()) - 1, bit_width); + // Old approach: allocate buffer + GetBatch then discard + std::vector discard_buf(num_values); + state.ResumeTiming(); + + decoder.GetBatch(discard_buf.data(), num_values); + benchmark::DoNotOptimize(discard_buf); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); +} + +static void BM_RleSkip_SkipBatch(benchmark::State& state) { + int num_values = static_cast(state.range(0)) * 1000; + int dict_size = 1000; + + auto rle_data = build_rle_dict_indexes(num_values, dict_size); + uint8_t bit_width = rle_data[0]; + + for (auto _ : state) { + state.PauseTiming(); + RleBatchDecoder decoder(rle_data.data() + 1, + static_cast(rle_data.size()) - 1, bit_width); + state.ResumeTiming(); + + decoder.SkipBatch(num_values); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * num_values); +} + +// ============================================================================ +// Benchmark Registrations +// ============================================================================ + +// --- ByteArrayDictDecoder --- +// Args: (dict_size, selectivity_percent, num_values_in_thousands) + +// Small dict (fits in L2 cache), various selectivities +BENCHMARK(BM_ByteArrayDictDecode_NoFilter) + ->Args({100, 5, 100}) + ->Args({100, 20, 100}) + ->Args({100, 50, 100}) + ->Args({100, 100, 100}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_ByteArrayDictDecode_WithFilter) + ->Args({100, 5, 100}) + ->Args({100, 20, 100}) + ->Args({100, 50, 100}) + ->Args({100, 100, 100}) + ->Unit(benchmark::kMicrosecond); + +// Large dict (exceeds L2 cache), various selectivities +// 100K entries × 32 bytes ≈ 3.2MB > typical L2 cache (256KB-1MB) +BENCHMARK(BM_ByteArrayDictDecode_NoFilter) + ->Args({100000, 1, 100}) + ->Args({100000, 5, 100}) + ->Args({100000, 20, 100}) + ->Args({100000, 50, 100}) + ->Args({100000, 100, 100}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_ByteArrayDictDecode_WithFilter) + ->Args({100000, 1, 100}) + ->Args({100000, 5, 100}) + ->Args({100000, 20, 100}) + ->Args({100000, 50, 100}) + ->Args({100000, 100, 100}) + ->Unit(benchmark::kMicrosecond); + +// Medium dict (borderline L2 cache) +BENCHMARK(BM_ByteArrayDictDecode_NoFilter) + ->Args({10000, 5, 100}) + ->Args({10000, 20, 100}) + ->Args({10000, 50, 100}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_ByteArrayDictDecode_WithFilter) + ->Args({10000, 5, 100}) + ->Args({10000, 20, 100}) + ->Args({10000, 50, 100}) + ->Unit(benchmark::kMicrosecond); + +// --- FixLengthDictDecoder --- + +// Small dict +BENCHMARK(BM_FixLenDictDecode_NoFilter) + ->Args({100, 5, 100}) + ->Args({100, 20, 100}) + ->Args({100, 50, 100}) + ->Unit(benchmark::kMicrosecond); + 
+BENCHMARK(BM_FixLenDictDecode_WithFilter) + ->Args({100, 5, 100}) + ->Args({100, 20, 100}) + ->Args({100, 50, 100}) + ->Unit(benchmark::kMicrosecond); + +// Large dict (exceeds L2 cache) +// 100K entries × 4 bytes = 400KB (still might fit in L2 for large caches) +// Use 1M entries for guaranteed L2 miss: 1M × 4 bytes = 4MB +BENCHMARK(BM_FixLenDictDecode_NoFilter) + ->Args({1000000, 5, 100}) + ->Args({1000000, 20, 100}) + ->Args({1000000, 50, 100}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_FixLenDictDecode_WithFilter) + ->Args({1000000, 5, 100}) + ->Args({1000000, 20, 100}) + ->Args({1000000, 50, 100}) + ->Unit(benchmark::kMicrosecond); + +// --- RLE SkipBatch --- +BENCHMARK(BM_RleSkip_GetBatch) + ->Args({10}) + ->Args({100}) + ->Args({1000}) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_RleSkip_SkipBatch) + ->Args({10}) + ->Args({100}) + ->Args({1000}) + ->Unit(benchmark::kMicrosecond); + +} // namespace doris::vectorized diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 1f01ff6f23765c..ef9a6915003f7e 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1595,6 +1595,23 @@ DEFINE_mInt64(compaction_batch_size, "-1"); // filter wrong data. DEFINE_mBool(enable_parquet_page_index, "true"); +// Whether to push down filter bitmap to the parquet decoder layer for lazy index decoding. +DEFINE_mBool(enable_parquet_lazy_dict_decode, "true"); + +// Whether to enable predicate column read order optimization in parquet lazy read. +DEFINE_mBool(enable_parquet_predicate_column_reorder, "true"); + +// Whether to enable lazy dictionary decode for non-predicate (lazy) string columns in parquet. +DEFINE_mBool(enable_parquet_lazy_dict_decode_for_lazy_columns, "true"); + +// Whether to enable AVX2 SIMD dict gather in parquet dictionary decoding. +// Benchmark shows SIMD gather is slower than scalar for most dict sizes on Alder Lake. +DEFINE_mBool(enable_parquet_simd_dict_decode, "false"); + +// Whether to enable software prefetch hints for large dictionary decoding in parquet. +// Benchmark shows software prefetch competes with hardware prefetcher, causing regression. +DEFINE_mBool(enable_parquet_dict_prefetch, "false"); + DEFINE_mBool(ignore_not_found_file_in_external_table, "true"); DEFINE_mBool(enable_hdfs_mem_limiter, "true"); diff --git a/be/src/common/config.h b/be/src/common/config.h index fb85d142ffeaa7..9aa38a055e421a 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1666,6 +1666,30 @@ DECLARE_mInt64(compaction_batch_size); DECLARE_mBool(enable_parquet_page_index); +// Whether to push down filter bitmap to the parquet decoder layer for lazy index decoding. +// When enabled and selectivity is low, FILTERED_CONTENT runs skip RLE index decoding +// instead of decoding all indexes upfront. +DECLARE_mBool(enable_parquet_lazy_dict_decode); + +// Whether to enable predicate column read order optimization in parquet lazy read. +// When enabled, predicate columns are read one by one with intermediate filtering, +// so highly-selective columns filter rows early, reducing decode work for subsequent columns. +DECLARE_mBool(enable_parquet_predicate_column_reorder); + +// Whether to enable lazy dictionary decode for non-predicate (lazy) string columns in parquet. +// When enabled, lazy string columns that are fully dictionary-encoded output int32 dict codes +// during Phase 2 read, then convert to strings only for rows surviving the filter. 
+DECLARE_mBool(enable_parquet_lazy_dict_decode_for_lazy_columns);
+
+// Whether to enable AVX2 SIMD dict gather in parquet dictionary decoding.
+// When enabled, INT32/FLOAT uses 8-wide AVX2 gather, INT64/DOUBLE uses 4-wide gather.
+DECLARE_mBool(enable_parquet_simd_dict_decode);
+
+// Whether to enable software prefetch hints for large dictionary decoding in parquet.
+// When enabled and dictionary exceeds L2 cache threshold, prefetch hints are emitted
+// to hide cache miss latency during dict gather (both SIMD and scalar paths).
+DECLARE_mBool(enable_parquet_dict_prefetch);
+
 // Wheather to ignore not found file in external teble(eg, hive)
 // Default is true, if set to false, the not found file will result in query failure.
 DECLARE_mBool(ignore_not_found_file_in_external_table);
diff --git a/be/src/util/rle_encoding.h b/be/src/util/rle_encoding.h
index 1685ab85f1efe1..530baeff8a35d4 100644
--- a/be/src/util/rle_encoding.h
+++ b/be/src/util/rle_encoding.h
@@ -699,6 +699,10 @@ class RleBatchDecoder {
     // Returns the number of consumed values or 0 if an error occurred.
     uint32_t GetBatch(T* values, uint32_t batch_num);
 
+    // Skip 'num_values' values without writing to any buffer.
+    // Returns the number of values actually skipped.
+    uint32_t SkipBatch(uint32_t num_values);
+
 private:
     // Called when both 'literal_count_' and 'repeat_count_' have been exhausted.
     // Sets either 'literal_count_' or 'repeat_count_' to the size of the next literal
@@ -885,5 +889,74 @@ uint32_t RleBatchDecoder<T>::GetBatch(T* values, uint32_t batch_num) {
     }
     return num_consumed;
 }
+
+template <typename T>
+uint32_t RleBatchDecoder<T>::SkipBatch(uint32_t num_values) {
+    DCHECK_GT(num_values, 0u);
+    uint32_t num_skipped = 0;
+    while (num_skipped < num_values) {
+        // Try to skip from repeated run first.
+        uint32_t num_repeats = NextNumRepeats();
+        if (num_repeats > 0) {
+            uint32_t to_skip = std::min(num_repeats, num_values - num_skipped);
+            // Consume repeats without writing any values.
+            GetRepeatedValue(to_skip);
+            num_skipped += to_skip;
+            continue;
+        }
+
+        // Try to skip from literal run.
+        uint32_t num_literals = NextNumLiterals();
+        if (num_literals == 0) {
+            // No more data.
+            break;
+        }
+        uint32_t to_skip = std::min(num_literals, num_values - num_skipped);
+        // Skip literals from the bit reader.
+        // First, consume any already-buffered literals.
+        if (HaveBufferedLiterals()) {
+            uint32_t buffered_skip = std::min(
+                    to_skip, static_cast<uint32_t>(num_buffered_literals_ - literal_buffer_pos_));
+            literal_buffer_pos_ += buffered_skip;
+            literal_count_ -= buffered_skip;
+            to_skip -= buffered_skip;
+            num_skipped += buffered_skip;
+        }
+        // For remaining literals, skip using the same approach as GetLiteralValues:
+        // 1. Skip in multiples of 32 via bit_reader_.SkipBatch (always byte-aligned).
+        // 2. Buffer the remainder via FillLiteralBuffer, then advance buffer position.
+        // This is necessary because BatchedBitReader::SkipBatch requires
+        // (bit_width * num_values) to be divisible by 8, which is guaranteed for
+        // multiples of 32 but not for arbitrary counts.
+        if (to_skip > 0 && literal_count_ > 0) {
+            uint32_t direct_skip = std::min(to_skip, static_cast<uint32_t>(literal_count_));
+            // Skip in multiples of 32 (byte-aligned) directly in the bit reader.
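+            // For example, with bit_width = 10, skipping 32 literals consumes exactly
+            // 320 bits (40 bytes), while skipping 7 literals would need 70 bits, which is
+            // not byte-aligned; hence the remainder below goes through the literal buffer.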
+ int32_t num_to_bypass = std::min( + literal_count_, BitUtil::RoundDownToPowerOf2(static_cast(direct_skip), + static_cast(32))); + if (num_to_bypass > 0) { + if (UNLIKELY(!bit_reader_.SkipBatch(bit_width_, num_to_bypass))) { + return num_skipped; + } + literal_count_ -= num_to_bypass; + direct_skip -= num_to_bypass; + num_skipped += num_to_bypass; + } + // For any remainder (< 32 values), buffer them and advance past. + if (direct_skip > 0 && literal_count_ > 0) { + if (UNLIKELY(!FillLiteralBuffer())) { + return num_skipped; + } + uint32_t buffered_skip = std::min( + direct_skip, + static_cast(num_buffered_literals_ - literal_buffer_pos_)); + literal_buffer_pos_ += buffered_skip; + literal_count_ -= buffered_skip; + num_skipped += buffered_skip; + } + } + } + return num_skipped; +} #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/vec/exec/format/parquet/bool_plain_decoder.cpp b/be/src/vec/exec/format/parquet/bool_plain_decoder.cpp index e4b7be36884ec1..6ec42e25e119e6 100644 --- a/be/src/vec/exec/format/parquet/bool_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/bool_plain_decoder.cpp @@ -54,7 +54,8 @@ Status BoolPlainDecoder::skip_values(size_t num_values) { } Status BoolPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/bool_plain_decoder.h b/be/src/vec/exec/format/parquet/bool_plain_decoder.h index f33f79be154e55..d0680199eb1f4c 100644 --- a/be/src/vec/exec/format/parquet/bool_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/bool_plain_decoder.h @@ -55,7 +55,8 @@ class BoolPlainDecoder final : public Decoder { } Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp index 645b9710251bf9..eff3b0a9fdf951 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp @@ -58,7 +58,8 @@ Status BoolRLEDecoder::skip_values(size_t num_values) { } Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.h b/be/src/vec/exec/format/parquet/bool_rle_decoder.h index 14028d72320243..3064f7028c7e56 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.h +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.h @@ -44,7 +44,8 @@ class BoolRLEDecoder final : public Decoder { Status set_data(Slice* slice) override; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool 
is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 49ab5cd584bb09..ab8cf7242f660f 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -19,6 +19,8 @@ #include +#include "common/compiler_util.h" +#include "common/config.h" #include "util/coding.h" #include "util/rle_encoding.h" #include "vec/columns/column.h" @@ -69,6 +71,12 @@ Status ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr& dict, int32 if (offset_cursor != length) { return Status::Corruption("Wrong dictionary data for byte array type"); } + // P1-5: Check if dictionary data exceeds L2 cache threshold. + // For string dicts, the relevant size is _dict_items (StringRef array) + _dict_data (string bodies). + // Typical L2 cache: 256KB-1MB per core. Use 256KB as conservative threshold. + constexpr size_t L2_CACHE_THRESHOLD = 256 * 1024; + size_t dict_memory = _dict_items.size() * sizeof(StringRef) + _dict_data.size(); + _dict_exceeds_l2_cache = dict_memory > L2_CACHE_THRESHOLD; return Status::OK(); } @@ -91,18 +99,21 @@ MutableColumnPtr ByteArrayDictDecoder::convert_dict_column_to_string_column( } Status ByteArrayDictDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + return _decode_values(doris_column, data_type, select_vector, is_dict_filter, + filter_data); } else { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + return _decode_values(doris_column, data_type, select_vector, is_dict_filter, + nullptr); } } template Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); if (doris_column->is_column_dictionary()) { ColumnDictI32& dict_column = assert_cast(*doris_column); @@ -113,6 +124,21 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data cast_set(_dict_items.size())); } } + + // When filter_data is provided and has_filter is true, use lazy index decoding: + // decode indexes per-run and skip FILTERED_CONTENT via SkipBatch. + // This avoids decoding RLE indexes for rows that will be discarded. + if constexpr (has_filter) { + if (filter_data != nullptr) { + if (doris_column->is_column_dictionary() || is_dict_filter) { + // For dict-filter path, we still need all indexes. + // Fall through to bulk decode below. 
+ } else { + return _lazy_decode_string_values(doris_column, select_vector); + } + } + } + _indexes.resize(non_null_size); _index_batch_decoder->GetBatch(_indexes.data(), cast_set(non_null_size)); @@ -126,13 +152,42 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - string_values.emplace_back(_dict_items[_indexes[dict_index++]]); + if (config::enable_parquet_simd_dict_decode) { + // P1-4: Use reusable buffer to avoid per-run heap allocation. + _string_values_buf.resize(run_length); + constexpr size_t PREFETCH_DISTANCE = 8; + for (size_t i = 0; i < run_length; ++i) { + // P1-5: Software prefetch for large dictionaries (separate config) + if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch && + i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + PREFETCH_DISTANCE]]); + } + _string_values_buf[i] = _dict_items[_indexes[dict_index++]]; + } + doris_column->insert_many_strings_overflow(_string_values_buf.data(), run_length, + _max_value_length); + } else if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch) { + // P1-5 only: scalar path with software prefetch for large dicts + std::vector string_values; + string_values.reserve(run_length); + constexpr size_t PREFETCH_DISTANCE = 8; + for (size_t i = 0; i < run_length; ++i) { + if (i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + PREFETCH_DISTANCE]]); + } + string_values.emplace_back(_dict_items[_indexes[dict_index++]]); + } + doris_column->insert_many_strings_overflow(string_values.data(), run_length, + _max_value_length); + } else { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + string_values.emplace_back(_dict_items[_indexes[dict_index++]]); + } + doris_column->insert_many_strings_overflow(string_values.data(), run_length, + _max_value_length); } - doris_column->insert_many_strings_overflow(string_values.data(), run_length, - _max_value_length); break; } case ColumnSelectVector::NULL_DATA: { @@ -151,6 +206,70 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data } return Status::OK(); } +Status ByteArrayDictDecoder::_lazy_decode_string_values(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector) { + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + // Decode only the indexes needed for this CONTENT run. 
+ _indexes.resize(run_length); + _index_batch_decoder->GetBatch(_indexes.data(), cast_set(run_length)); + if (config::enable_parquet_simd_dict_decode) { + // P1-4: Reusable buffer + P1-5: software prefetch for lazy path + _string_values_buf.resize(run_length); + constexpr size_t PREFETCH_DISTANCE = 8; + for (size_t i = 0; i < run_length; ++i) { + if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch && + i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[i + PREFETCH_DISTANCE]]); + } + _string_values_buf[i] = _dict_items[_indexes[i]]; + } + doris_column->insert_many_strings_overflow(_string_values_buf.data(), run_length, + _max_value_length); + } else if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch) { + // P1-5 only: scalar path with software prefetch for lazy path + std::vector string_values; + string_values.reserve(run_length); + constexpr size_t PREFETCH_DISTANCE = 8; + for (size_t i = 0; i < run_length; ++i) { + if (i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[i + PREFETCH_DISTANCE]]); + } + string_values.emplace_back(_dict_items[_indexes[i]]); + } + doris_column->insert_many_strings_overflow(string_values.data(), run_length, + _max_value_length); + } else { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + string_values.emplace_back(_dict_items[_indexes[i]]); + } + doris_column->insert_many_strings_overflow(string_values.data(), run_length, + _max_value_length); + } + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + // Skip indexes in the RLE stream without decoding them. + _index_batch_decoder->SkipBatch(cast_set(run_length)); + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // No indexes to skip for null values. + break; + } + } + } + return Status::OK(); +} + #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h index 762a9c5b885d83..0d34de033f938d 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h @@ -43,11 +43,13 @@ class ByteArrayDictDecoder final : public BaseDictDecoder { ~ByteArrayDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter); + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data); Status set_dict(DorisUniqueBufferPtr& dict, int32_t length, size_t num_values) override; @@ -57,10 +59,18 @@ class ByteArrayDictDecoder final : public BaseDictDecoder { MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override; protected: + // Lazy index decoding path: decode indexes per-run, skip FILTERED_CONTENT via SkipBatch. 
+ Status _lazy_decode_string_values(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector); + // For dictionary encoding std::vector _dict_items; std::vector _dict_data; size_t _max_value_length; + // P1-4: Reusable buffer for string dict gather to avoid per-run heap allocation. + std::vector _string_values_buf; + // P1-5: Whether dictionary exceeds L2 cache threshold (triggers software prefetching) + bool _dict_exceeds_l2_cache = false; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp index 7092a4fb2924e7..cc667ef6d58856 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp @@ -43,8 +43,8 @@ Status ByteArrayPlainDecoder::skip_values(size_t num_values) { } Status ByteArrayPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h index 9a6c69834f5a65..8ef80a0eef3511 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h @@ -48,7 +48,8 @@ class ByteArrayPlainDecoder final : public Decoder { ~ByteArrayPlainDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, diff --git a/be/src/vec/exec/format/parquet/byte_stream_split_decoder.cpp b/be/src/vec/exec/format/parquet/byte_stream_split_decoder.cpp index 931198881afdc3..b158a94901afed 100644 --- a/be/src/vec/exec/format/parquet/byte_stream_split_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_stream_split_decoder.cpp @@ -24,8 +24,8 @@ namespace doris::vectorized { #include "common/compile_check_begin.h" Status ByteStreamSplitDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/byte_stream_split_decoder.h b/be/src/vec/exec/format/parquet/byte_stream_split_decoder.h index 4d62aed025fcea..9bb417f0b246e7 100644 --- a/be/src/vec/exec/format/parquet/byte_stream_split_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_stream_split_decoder.h @@ -27,7 +27,8 @@ class ByteStreamSplitDecoder final : public Decoder { ~ByteStreamSplitDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, 
DataTypePtr& data_type, diff --git a/be/src/vec/exec/format/parquet/column_read_order_ctx.h b/be/src/vec/exec/format/parquet/column_read_order_ctx.h new file mode 100644 index 00000000000000..665e40024613f2 --- /dev/null +++ b/be/src/vec/exec/format/parquet/column_read_order_ctx.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace doris::vectorized { + +/// Manages the read order of predicate columns in lazy-read mode. +/// +/// During the first EXPLORATION_ROUNDS batches, it tries random column orders +/// and records which order yields the lowest "round cost" (i.e., fewest rows +/// that survive after each column's filter). After exploration, it locks in +/// the best order found and uses it for all subsequent batches. +class ColumnReadOrderCtx { +public: + /// @param col_indices Indices into the predicate_columns arrays (0-based). + /// @param col_cost_map Index -> estimated per-row decode cost (e.g., type_length). + /// @param total_cost Sum of all column costs (initial upper bound for round cost). + ColumnReadOrderCtx(std::vector col_indices, + std::unordered_map col_cost_map, size_t total_cost) + : _best_order(std::move(col_indices)), + _col_cost_map(std::move(col_cost_map)), + _min_round_cost(total_cost) {} + + /// Returns the column read order for the current batch. + /// During exploration, returns a random permutation; afterwards, the best order. + const std::vector& get_column_read_order() { + if (_exploration_remaining > 0) { + _trying_order = _best_order; + std::shuffle(_trying_order.begin(), _trying_order.end(), + std::mt19937(std::random_device {}())); + return _trying_order; + } + return _best_order; + } + + /// Called after each batch to record cost metrics. + /// @param round_cost Accumulated cost for this batch (weighted by rows decoded). + /// @param first_selectivity Fraction of rows surviving after the first column's filter. + void update(size_t round_cost, double first_selectivity) { + if (_exploration_remaining > 0) { + if (round_cost < _min_round_cost || + (round_cost == _min_round_cost && first_selectivity > 0 && + first_selectivity < _best_first_selectivity)) { + _best_order = _trying_order; + _min_round_cost = round_cost; + _best_first_selectivity = first_selectivity; + } + _trying_order.clear(); + _exploration_remaining--; + } + } + + size_t get_column_cost(size_t col_index) const { + auto it = _col_cost_map.find(col_index); + return it != _col_cost_map.end() ? 
it->second : 0; + } + + bool in_exploration() const { return _exploration_remaining > 0; } + +private: + static constexpr int EXPLORATION_ROUNDS = 10; + + std::vector _best_order; + std::vector _trying_order; + std::unordered_map _col_cost_map; // col_index -> per-row cost + size_t _min_round_cost; + double _best_first_selectivity = 1.0; + int _exploration_remaining = EXPLORATION_ROUNDS; +}; + +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h index 81f328ded4320d..bcf94c5539147c 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -67,7 +67,8 @@ class Decoder { // Write the decoded values batch to doris's column virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) = 0; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) = 0; virtual Status skip_values(size_t num_values) = 0; @@ -147,8 +148,12 @@ class BaseDictDecoder : public Decoder { } Status skip_values(size_t num_values) override { - _indexes.resize(num_values); - _index_batch_decoder->GetBatch(_indexes.data(), cast_set(num_values)); + auto skipped = _index_batch_decoder->SkipBatch(cast_set(num_values)); + if (UNLIKELY(skipped < num_values)) { + return Status::InternalError( + "RLE skip error: not enough values to skip, expected {}, got {}", num_values, + skipped); + } return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h index 9ba03c45288783..58c2584c90f28b 100644 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h @@ -152,7 +152,8 @@ class DeltaBitPackDecoder final : public DeltaDecoder { } Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override { size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); // decode values _values.resize(non_null_size); @@ -165,7 +166,7 @@ class DeltaBitPackDecoder final : public DeltaDecoder { // set decoded value with fix plain decoder RETURN_IF_ERROR(init_values_converter()); return _type_converted_decoder->decode_values(doris_column, data_type, select_vector, - is_dict_filter); + is_dict_filter, filter_data); } Status decode(T* buffer, uint32_t num_values, uint32_t* out_num_values) { @@ -237,7 +238,8 @@ class DeltaLengthByteArrayDecoder final : public DeltaDecoder { } Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { @@ -312,7 +314,8 @@ class DeltaByteArrayDecoder : public DeltaDecoder { } Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, 
is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index c47df37c4d15a0..0a21880372bf2c 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -17,6 +17,12 @@ #pragma once +#ifdef __AVX2__ +#include +#endif + +#include "common/compiler_util.h" +#include "common/config.h" #include "util/bit_util.h" #include "util/memcpy_inlined.h" #include "vec/columns/column_dictionary.h" @@ -68,17 +74,21 @@ class FixLengthDictDecoder final : public BaseDictDecoder { ~FixLengthDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override { if (select_vector.has_filter()) { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + return _decode_values(doris_column, data_type, select_vector, is_dict_filter, + filter_data); } else { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + return _decode_values(doris_column, data_type, select_vector, is_dict_filter, + nullptr); } } template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); if (doris_column->is_column_dictionary() && assert_cast(*doris_column).dict_size() == 0) { @@ -94,6 +104,16 @@ class FixLengthDictDecoder final : public BaseDictDecoder { .insert_many_dict_data(dict_items.data(), cast_set(dict_items.size())); } + + // When filter_data is provided and has_filter is true, use lazy index decoding: + // decode indexes per-run and skip FILTERED_CONTENT via SkipBatch. 
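+        // As in the byte-array decoder above, the dict-column and dict-filter paths are
+        // excluded because they still need the codes for every row. E.g. with a 4096-row
+        // batch at 5% selectivity, roughly 3,900 indexes are skipped via SkipBatch instead
+        // of being materialized through GetBatch.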
+ if constexpr (has_filter) { + if (filter_data != nullptr && !doris_column->is_column_dictionary() && + !is_dict_filter) { + return _lazy_decode_fixed_values(doris_column, data_type, select_vector); + } + } + _indexes.resize(non_null_size); _index_batch_decoder->GetBatch(_indexes.data(), cast_set(non_null_size)); @@ -151,10 +171,27 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } data_index = dst_ptr - raw_data; } else { - // Original path for non-FIXED_LEN_BYTE_ARRAY types - for (size_t i = 0; i < run_length; ++i) { - *(cppType*)(raw_data + data_index) = _dict_items[_indexes[dict_index++]]; - data_index += _type_length; + if (config::enable_parquet_simd_dict_decode) { + // P1-4: SIMD dict gather for scalar types (INT32/INT64/FLOAT/DOUBLE) + // P1-5: Software prefetch for large dictionaries + _simd_dict_gather(raw_data, data_index, dict_index, run_length); + } else if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch) { + // P1-5 only: scalar loop with software prefetch for large dicts + constexpr size_t PF_DIST = 8; + for (size_t i = 0; i < run_length; ++i) { + if (i + PF_DIST < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PF_DIST]]); + } + *(cppType*)(raw_data + data_index) = + _dict_items[_indexes[dict_index++]]; + data_index += _type_length; + } + } else { + for (size_t i = 0; i < run_length; ++i) { + *(cppType*)(raw_data + data_index) = + _dict_items[_indexes[dict_index++]]; + data_index += _type_length; + } } } break; @@ -176,6 +213,149 @@ class FixLengthDictDecoder final : public BaseDictDecoder { return Status::OK(); } + // Lazy index decoding path: decode indexes per-run, skip FILTERED_CONTENT via SkipBatch. + Status _lazy_decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector) { + size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t data_index = doris_column->size() * primitive_length; + size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * + (_type_length / primitive_length); + doris_column->resize(doris_column->size() + scale_size); + char* raw_data = const_cast(doris_column->get_raw_data().data); + + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + // Decode only the indexes needed for this CONTENT run. 
+ _indexes.resize(run_length); + _index_batch_decoder->GetBatch(_indexes.data(), cast_set(run_length)); + if constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { + char* dst_ptr = raw_data + data_index; + for (size_t i = 0; i < run_length; ++i) { + auto& slice = _dict_items[_indexes[i]]; + doris::memcpy_inlined(dst_ptr, slice.get_data(), _type_length); + dst_ptr += _type_length; + } + data_index = dst_ptr - raw_data; + } else { + if (config::enable_parquet_simd_dict_decode) { + // P1-4: SIMD dict gather + P1-5: prefetch for lazy decode path + size_t local_dict_index = 0; + _simd_dict_gather(raw_data, data_index, local_dict_index, run_length); + } else if (_dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch) { + // P1-5 only: scalar loop with software prefetch for large dicts + constexpr size_t PF_DIST = 8; + for (size_t i = 0; i < run_length; ++i) { + if (i + PF_DIST < run_length) { + PREFETCH(&_dict_items[_indexes[i + PF_DIST]]); + } + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[i]]; + data_index += _type_length; + } + } else { + for (size_t i = 0; i < run_length; ++i) { + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[i]]; + data_index += _type_length; + } + } + } + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + // Skip indexes in the RLE stream without decoding them. + _index_batch_decoder->SkipBatch(cast_set(run_length)); + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // No indexes to skip for null values. + break; + } + } + } + return Status::OK(); + } + + // P1-4: SIMD dict gather + P1-5: software prefetch for scalar types. + // Uses AVX2 gather instructions for INT32/FLOAT (8 values/op) and INT64/DOUBLE (4 values/op). + // Falls back to scalar loop with software prefetch for large dictionaries. 
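+    // For INT32/FLOAT, one _mm256_i32gather_epi32(dict_base, indexes, /*scale=*/4) fetches
+    // dict_base[indexes[k]] for eight lanes in a single instruction (32 bytes of output);
+    // the INT64/DOUBLE path uses _mm256_i32gather_epi64 to fetch four entries per call.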
+ ALWAYS_INLINE void _simd_dict_gather(char* raw_data, size_t& data_index, size_t& dict_index, + size_t run_length) { + constexpr size_t PREFETCH_DISTANCE = 8; + const bool use_prefetch = _dict_exceeds_l2_cache && config::enable_parquet_dict_prefetch; + +#ifdef __AVX2__ + if constexpr (PhysicalType == tparquet::Type::INT32 || + PhysicalType == tparquet::Type::FLOAT) { + // 4-byte types: gather 8 values per AVX2 instruction + size_t i = 0; + for (; i + 8 <= run_length; i += 8) { + if (use_prefetch && i + PREFETCH_DISTANCE + 8 <= run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE]]); + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE + 4]]); + } + __m256i indices = _mm256_loadu_si256( + reinterpret_cast(&_indexes[dict_index + i])); + __m256i gathered = _mm256_i32gather_epi32( + reinterpret_cast(_dict_items.data()), indices, 4); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(raw_data + data_index), gathered); + data_index += 32; // 8 × 4 bytes + } + // Scalar tail + for (; i < run_length; ++i) { + if (use_prefetch && i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE]]); + } + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[dict_index + i]]; + data_index += _type_length; + } + dict_index += run_length; + return; + } + if constexpr (PhysicalType == tparquet::Type::INT64 || + PhysicalType == tparquet::Type::DOUBLE) { + // 8-byte types: gather 4 values per AVX2 instruction + // _mm256_i32gather_epi64 takes a __m128i of 4 int32 indices + size_t i = 0; + for (; i + 4 <= run_length; i += 4) { + if (use_prefetch && i + PREFETCH_DISTANCE + 4 <= run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE]]); + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE + 2]]); + } + __m128i indices = _mm_loadu_si128( + reinterpret_cast(&_indexes[dict_index + i])); + __m256i gathered = _mm256_i32gather_epi64( + reinterpret_cast(_dict_items.data()), indices, 8); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(raw_data + data_index), gathered); + data_index += 32; // 4 × 8 bytes + } + // Scalar tail + for (; i < run_length; ++i) { + if (use_prefetch && i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE]]); + } + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[dict_index + i]]; + data_index += _type_length; + } + dict_index += run_length; + return; + } +#endif + // Scalar fallback with optional prefetch (also covers INT96 etc.) + for (size_t i = 0; i < run_length; ++i) { + if (use_prefetch && i + PREFETCH_DISTANCE < run_length) { + PREFETCH(&_dict_items[_indexes[dict_index + i + PREFETCH_DISTANCE]]); + } + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[dict_index + i]]; + data_index += _type_length; + } + dict_index += run_length; + } + Status set_dict(DorisUniqueBufferPtr& dict, int32_t length, size_t num_values) override { if (num_values * _type_length != length) { @@ -195,6 +375,10 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } dict_item_address += _type_length; } + // P1-5: Check if dictionary exceeds L2 cache threshold for prefetch decisions. + // Typical L2 cache: 256KB-1MB per core. Use 256KB as conservative threshold. 
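+        // For example, an INT32 dictionary crosses this threshold once it has more than
+        // 65,536 entries (64K × 4 bytes = 256 KB); an INT64 dictionary at more than 32,768.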
+ constexpr size_t L2_CACHE_THRESHOLD = 256 * 1024; + _dict_exceeds_l2_cache = (num_values * sizeof(cppType)) > L2_CACHE_THRESHOLD; return Status::OK(); } @@ -226,6 +410,8 @@ class FixLengthDictDecoder final : public BaseDictDecoder { } // For dictionary encoding std::vector::CppType> _dict_items; + // P1-5: Whether dictionary size exceeds L2 cache threshold (triggers software prefetching) + bool _dict_exceeds_l2_cache = false; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp index af01cd090e2334..d1278a252867bd 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp @@ -30,8 +30,8 @@ Status FixLengthPlainDecoder::skip_values(size_t num_values) { } Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { if (select_vector.has_filter()) { return _decode_values(doris_column, data_type, select_vector, is_dict_filter); } else { diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index bd0e4e94b14832..364a8f9c63d062 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -37,7 +37,8 @@ class FixLengthPlainDecoder final : public Decoder { ~FixLengthPlainDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter) override; + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override; template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -47,6 +48,22 @@ class FixLengthPlainDecoder final : public Decoder { return Status::IOError("Out-of-bounds access in parquet data decoder"); } + // P1-6: Fast path when no nulls and no filter — single memcpy for the entire batch. + // This avoids the run loop overhead when the entire batch is one contiguous CONTENT run. 
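+        // For example, a 4096-row batch of INT32 values becomes a single 16 KB memcpy
+        // (4096 × 4 bytes) instead of a per-run copy loop.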
+ if constexpr (!has_filter) { + if (select_vector.num_nulls() == 0) { + size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t data_index = doris_column->size() * primitive_length; + size_t scale_size = non_null_size * (_type_length / primitive_length); + doris_column->resize(doris_column->size() + scale_size); + char* raw_data = const_cast(doris_column->get_raw_data().data); + size_t total_bytes = non_null_size * _type_length; + memcpy(raw_data + data_index, _data->data + _offset, total_bytes); + _offset += total_bytes; + return Status::OK(); + } + } + size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); size_t data_index = doris_column->size() * primitive_length; size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 61568dc4f4c901..0fd2d9a0b9fe93 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -528,7 +528,7 @@ Status ColumnChunkReader::skip_values(size_t num_va template Status ColumnChunkReader::decode_values( MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, - bool is_dict_filter) { + bool is_dict_filter, const uint8_t* filter_data) { if (select_vector.num_values() == 0) { return Status::OK(); } @@ -540,7 +540,8 @@ Status ColumnChunkReader::decode_values( return Status::IOError("Decode too many values in current page"); } _remaining_num_values -= select_vector.num_values(); - return _page_decoder->decode_values(doris_column, data_type, select_vector, is_dict_filter); + return _page_decoder->decode_values(doris_column, data_type, select_vector, is_dict_filter, + filter_data); } template diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index d0bf7ab2d81085..eeb26a608cf8b5 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -132,7 +132,8 @@ class ColumnChunkReader { // Decode values in current page into doris column. Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter); + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr); // Get the repetition level decoder of current page. LevelDecoder& rep_level_decoder() { return _rep_level_decoder; } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 0f673804260c69..4caa52222f9369 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -25,6 +25,7 @@ #include #include +#include "common/config.h" #include "io/fs/tracing_file_reader.h" #include "runtime/define_primitive_type.h" #include "schema_desc.h" @@ -394,7 +395,16 @@ Status ScalarColumnReader::_read_values(size_t num_ _filter_map_index)); _filter_map_index += num_values; } - return _chunk_reader->decode_values(data_column, type, select_vector, is_dict_filter); + // When filter_ratio is high (low selectivity), enable lazy dict index decoding + // by passing a non-null filter_data signal to the decoder. 
The decoder will then + // skip RLE index decoding for FILTERED_CONTENT runs instead of decoding upfront. + const uint8_t* filter_data = nullptr; + if (config::enable_parquet_lazy_dict_decode && filter_map.has_filter() && + filter_map.filter_ratio() > 0.95) { + filter_data = filter_map.filter_map_data(); + } + return _chunk_reader->decode_values(data_column, type, select_vector, is_dict_filter, + filter_data); } /** diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index c66eccc642a1ca..3af4ad30544d8a 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -193,158 +193,236 @@ Status RowGroupReader::init( std::ranges::sort(_filter_conjuncts, [](const auto& a, const auto& b) { return a->execute_cost() < b->execute_cost(); }); - } - return Status::OK(); -} -bool RowGroupReader::_can_filter_by_dict(int slot_id, - const tparquet::ColumnMetaData& column_metadata) { - SlotDescriptor* slot = nullptr; - const std::vector& slots = _tuple_descriptor->slots(); - for (auto each : slots) { - if (each->id() == slot_id) { - slot = each; - break; + // P0-3: Confirm lazy dict decode candidates are fully dictionary-encoded in this row group. + // Only active when lazy read is enabled and there are candidates. + if (_lazy_read_ctx.can_lazy_read && !_lazy_read_ctx.lazy_dict_decode_candidates.empty()) { + for (const auto& [col_name, slot_id] : _lazy_read_ctx.lazy_dict_decode_candidates) { + auto file_col_name = _table_info_node_ptr->children_file_column_name(col_name); + auto* field = schema.get_column(file_col_name); + if (field == nullptr) { + continue; + } + const auto& col_meta = + _row_group_meta.columns[field->physical_column_index].meta_data; + if (is_dictionary_encoded(col_meta)) { + _lazy_dict_decode_cols.emplace_back(col_name, slot_id); + } + } } - } - if (!is_string_type(slot->type()->get_primitive_type()) && - !is_var_len_object(slot->type()->get_primitive_type())) { - return false; - } - if (column_metadata.type != tparquet::Type::BYTE_ARRAY) { - return false; - } - if (!is_dictionary_encoded(column_metadata)) { - return false; - } + // P0-2: Initialize per-column predicate read order optimization. + // Classify _filter_conjuncts into per-column groups and multi-column group. + // Only activate when lazy read is enabled and there are multiple predicate columns. 
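+        // Example: with predicate columns {a, b} and conjuncts [a > 10, b IN ('x', 'y'), a + b > 5],
+        // the first two go into _per_col_conjuncts for a and b respectively and can be evaluated as
+        // soon as their own column has been read, while a + b > 5 references both columns and can only
+        // be evaluated once all of its referenced columns are available, so it stays in _multi_col_conjuncts.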
+ if (config::enable_parquet_predicate_column_reorder && _lazy_read_ctx.can_lazy_read && + _lazy_read_ctx.predicate_columns.first.size() > 1 && !_filter_conjuncts.empty()) { + const auto& pred_col_slot_ids = _lazy_read_ctx.predicate_columns.second; + // Build slot_id -> predicate column index map + std::unordered_map slot_id_to_pred_idx; + for (size_t i = 0; i < pred_col_slot_ids.size(); ++i) { + slot_id_to_pred_idx[pred_col_slot_ids[i]] = i; + } - if (_slot_id_to_filter_conjuncts->find(slot_id) == _slot_id_to_filter_conjuncts->end()) { - return false; - } + // Classify each conjunct + for (auto& conjunct : _filter_conjuncts) { + // Collect all slot_ids referenced by this conjunct + std::set referenced_slot_ids; + _collect_slot_ids_from_expr(conjunct->root().get(), referenced_slot_ids); + + // Check if all referenced slots belong to a single predicate column + size_t matched_pred_idx = std::numeric_limits::max(); + bool is_single_pred_col = true; + for (int sid : referenced_slot_ids) { + auto it = slot_id_to_pred_idx.find(sid); + if (it != slot_id_to_pred_idx.end()) { + if (matched_pred_idx == std::numeric_limits::max()) { + matched_pred_idx = it->second; + } else if (matched_pred_idx != it->second) { + is_single_pred_col = false; + break; + } + } + } - // TODO: The current implementation of dictionary filtering does not take into account - // the implementation of NULL values because the dictionary itself does not contain - // NULL value encoding. As a result, many NULL-related functions or expressions - // cannot work properly, such as is null, is not null, coalesce, etc. - // Here we check if the predicate expr is IN or BINARY_PRED. - // Implementation of NULL value dictionary filtering will be carried out later. - return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), [&](const auto& ctx) { - return (ctx->root()->node_type() == TExprNodeType::IN_PRED || - ctx->root()->node_type() == TExprNodeType::BINARY_PRED) && - ctx->root()->children()[0]->node_type() == TExprNodeType::SLOT_REF; - }); -} + if (is_single_pred_col && matched_pred_idx != std::numeric_limits::max()) { + _per_col_conjuncts[matched_pred_idx].push_back(conjunct); + } else { + _multi_col_conjuncts.push_back(conjunct); + } + } -// This function is copied from -// https://github.com/apache/impala/blob/master/be/src/exec/parquet/hdfs-parquet-scanner.cc#L1717 -bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& column_metadata) { - // The Parquet spec allows for column chunks to have mixed encodings - // where some data pages are dictionary-encoded and others are plain - // encoded. For example, a Parquet file writer might start writing - // a column chunk as dictionary encoded, but it will switch to plain - // encoding if the dictionary grows too large. - // - // In order for dictionary filters to skip the entire row group, - // the conjuncts must be evaluated on column chunks that are entirely - // encoded with the dictionary encoding. There are two checks - // available to verify this: - // 1. The encoding_stats field on the column chunk metadata provides - // information about the number of data pages written in each - // format. This allows for a specific check of whether all the - // data pages are dictionary encoded. - // 2. The encodings field on the column chunk metadata lists the - // encodings used. If this list contains the dictionary encoding - // and does not include unexpected encodings (i.e. 
encodings not - // associated with definition/repetition levels), then it is entirely - // dictionary encoded. - if (column_metadata.__isset.encoding_stats) { - // Condition #1 above - for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) { - if (enc_stat.page_type == tparquet::PageType::DATA_PAGE && - (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY && - enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) && - enc_stat.count > 0) { - return false; + // Only activate if at least one predicate column has conjuncts + bool has_per_col = false; + for (auto& [idx, ctxs] : _per_col_conjuncts) { + if (!ctxs.empty()) { + has_per_col = true; + break; + } + } + + if (has_per_col) { + _enable_per_column_lazy_read = true; + // Initialize ColumnReadOrderCtx with column indices and cost estimates + std::vector col_indices; + std::unordered_map col_cost_map; + size_t total_cost = 0; + for (size_t i = 0; i < _lazy_read_ctx.predicate_columns.first.size(); ++i) { + col_indices.push_back(i); + // Use a simple cost heuristic: columns with conjuncts get lower cost + // (they should be read first since they filter rows). + // For now, use uniform cost=1 for simplicity; the exploration will find + // the best order based on actual selectivity. + size_t cost = 1; + col_cost_map[i] = cost; + total_cost += cost; + } + _column_read_order_ctx = + std::make_unique(col_indices, col_cost_map, total_cost); } } - } else { - // Condition #2 above - bool has_dict_encoding = false; - bool has_nondict_encoding = false; - for (const tparquet::Encoding::type& encoding : column_metadata.encodings) { - if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || - encoding == tparquet::Encoding::RLE_DICTIONARY) { - has_dict_encoding = true; - } - - // RLE and BIT_PACKED are used for repetition/definition levels - if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && - encoding != tparquet::Encoding::RLE_DICTIONARY && - encoding != tparquet::Encoding::RLE && encoding != tparquet::Encoding::BIT_PACKED) { - has_nondict_encoding = true; + return Status::OK(); + } + + bool RowGroupReader::_can_filter_by_dict(int slot_id, + const tparquet::ColumnMetaData& column_metadata) { + SlotDescriptor* slot = nullptr; + const std::vector& slots = _tuple_descriptor->slots(); + for (auto each : slots) { + if (each->id() == slot_id) { + slot = each; break; } } - // Not entirely dictionary encoded if: - // 1. No dictionary encoding listed - // OR - // 2. Some non-dictionary encoding is listed - if (!has_dict_encoding || has_nondict_encoding) { + if (!is_string_type(slot->type()->get_primitive_type()) && + !is_var_len_object(slot->type()->get_primitive_type())) { + return false; + } + if (column_metadata.type != tparquet::Type::BYTE_ARRAY) { return false; } - } - return true; -} + if (!is_dictionary_encoded(column_metadata)) { + return false; + } -Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_rows, - bool* batch_eof) { - if (_is_row_group_filtered) { - *read_rows = 0; - *batch_eof = true; - return Status::OK(); - } + if (_slot_id_to_filter_conjuncts->find(slot_id) == _slot_id_to_filter_conjuncts->end()) { + return false; + } - // Process external table query task that select columns are all from path. 
- if (_read_table_columns.empty()) { - bool modify_row_ids = false; - RETURN_IF_ERROR(_read_empty_batch(batch_size, read_rows, batch_eof, &modify_row_ids)); + // TODO: The current implementation of dictionary filtering does not take into account + // the implementation of NULL values because the dictionary itself does not contain + // NULL value encoding. As a result, many NULL-related functions or expressions + // cannot work properly, such as is null, is not null, coalesce, etc. + // Here we check if the predicate expr is IN or BINARY_PRED. + // Implementation of NULL value dictionary filtering will be carried out later. + return std::ranges::all_of(_slot_id_to_filter_conjuncts->at(slot_id), [&](const auto& ctx) { + return (ctx->root()->node_type() == TExprNodeType::IN_PRED || + ctx->root()->node_type() == TExprNodeType::BINARY_PRED) && + ctx->root()->children()[0]->node_type() == TExprNodeType::SLOT_REF; + }); + } - RETURN_IF_ERROR( - _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); + // This function is copied from + // https://github.com/apache/impala/blob/master/be/src/exec/parquet/hdfs-parquet-scanner.cc#L1717 + bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& column_metadata) { + // The Parquet spec allows for column chunks to have mixed encodings + // where some data pages are dictionary-encoded and others are plain + // encoded. For example, a Parquet file writer might start writing + // a column chunk as dictionary encoded, but it will switch to plain + // encoding if the dictionary grows too large. + // + // In order for dictionary filters to skip the entire row group, + // the conjuncts must be evaluated on column chunks that are entirely + // encoded with the dictionary encoding. There are two checks + // available to verify this: + // 1. The encoding_stats field on the column chunk metadata provides + // information about the number of data pages written in each + // format. This allows for a specific check of whether all the + // data pages are dictionary encoded. + // 2. The encodings field on the column chunk metadata lists the + // encodings used. If this list contains the dictionary encoding + // and does not include unexpected encodings (i.e. encodings not + // associated with definition/repetition levels), then it is entirely + // dictionary encoded. 
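+        // Example: encoding_stats = [{DICTIONARY_PAGE, PLAIN_DICTIONARY}, {DATA_PAGE, RLE_DICTIONARY, count = 8}]
+        // satisfies check #1, whereas a single {DATA_PAGE, PLAIN, count > 0} entry means the writer fell
+        // back to plain encoding for part of the chunk, so dictionary filtering must be disabled for it.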
+ if (column_metadata.__isset.encoding_stats) { + // Condition #1 above + for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) { + if (enc_stat.page_type == tparquet::PageType::DATA_PAGE && + (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY && + enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) && + enc_stat.count > 0) { + return false; + } + } + } else { + // Condition #2 above + bool has_dict_encoding = false; + bool has_nondict_encoding = false; + for (const tparquet::Encoding::type& encoding : column_metadata.encodings) { + if (encoding == tparquet::Encoding::PLAIN_DICTIONARY || + encoding == tparquet::Encoding::RLE_DICTIONARY) { + has_dict_encoding = true; + } - RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, modify_row_ids)); + // RLE and BIT_PACKED are used for repetition/definition levels + if (encoding != tparquet::Encoding::PLAIN_DICTIONARY && + encoding != tparquet::Encoding::RLE_DICTIONARY && + encoding != tparquet::Encoding::RLE && + encoding != tparquet::Encoding::BIT_PACKED) { + has_nondict_encoding = true; + break; + } + } + // Not entirely dictionary encoded if: + // 1. No dictionary encoding listed + // OR + // 2. Some non-dictionary encoding is listed + if (!has_dict_encoding || has_nondict_encoding) { + return false; + } + } - Status st = VExprContext::filter_block(_lazy_read_ctx.conjuncts, block, block->columns()); - *read_rows = block->rows(); - return st; + return true; } - if (_lazy_read_ctx.can_lazy_read) { - // call _do_lazy_read recursively when current batch is skipped - return _do_lazy_read(block, batch_size, read_rows, batch_eof); - } else { - FilterMap filter_map; - RETURN_IF_ERROR((_read_column_data(block, _lazy_read_ctx.all_read_columns, batch_size, - read_rows, batch_eof, filter_map))); - RETURN_IF_ERROR( - _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); - RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, false)); -#ifndef NDEBUG - for (auto col : *block) { - col.column->sanity_check(); - DCHECK(block->rows() == col.column->size()) - << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", - block->rows(), col.column->size(), col.name); + Status RowGroupReader::next_batch(Block * block, size_t batch_size, size_t * read_rows, + bool* batch_eof) { + if (_is_row_group_filtered) { + *read_rows = 0; + *batch_eof = true; + return Status::OK(); } -#endif - if (block->rows() == 0) { - _convert_dict_cols_to_string_cols(block); + // Process external table query task that select columns are all from path. 
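+        // i.e. every selected column comes from the file path (partition values) or is missing from
+        // the file entirely, so no parquet column data is decoded: we only advance the row position,
+        // apply position deletes, fill the constant columns, and then run the conjuncts.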
+ if (_read_table_columns.empty()) { + bool modify_row_ids = false; + RETURN_IF_ERROR(_read_empty_batch(batch_size, read_rows, batch_eof, &modify_row_ids)); + + RETURN_IF_ERROR( + _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR( + _fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); + + RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, modify_row_ids)); + + Status st = + VExprContext::filter_block(_lazy_read_ctx.conjuncts, block, block->columns()); *read_rows = block->rows(); + return st; + } + if (_lazy_read_ctx.can_lazy_read) { + // call _do_lazy_read recursively when current batch is skipped + return _do_lazy_read(block, batch_size, read_rows, batch_eof); + } else { + FilterMap filter_map; + RETURN_IF_ERROR((_read_column_data(block, _lazy_read_ctx.all_read_columns, batch_size, + read_rows, batch_eof, filter_map))); + RETURN_IF_ERROR( + _fill_partition_columns(block, *read_rows, _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR( + _fill_missing_columns(block, *read_rows, _lazy_read_ctx.missing_columns)); + RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, false)); + #ifndef NDEBUG for (auto col : *block) { col.column->sanity_check(); @@ -353,836 +431,1334 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ block->rows(), col.column->size(), col.name); } #endif - return Status::OK(); - } - { - SCOPED_RAW_TIMER(&_predicate_filter_time); - RETURN_IF_ERROR(_build_pos_delete_filter(*read_rows)); - std::vector columns_to_filter; - int column_to_keep = block->columns(); - columns_to_filter.resize(column_to_keep); - for (uint32_t i = 0; i < column_to_keep; ++i) { - columns_to_filter[i] = i; - } - if (!_lazy_read_ctx.conjuncts.empty()) { - std::vector filters; - if (_position_delete_ctx.has_filter) { - filters.push_back(_pos_delete_filter_ptr.get()); + if (block->rows() == 0) { + _convert_dict_cols_to_string_cols(block); + *read_rows = block->rows(); +#ifndef NDEBUG + for (auto col : *block) { + col.column->sanity_check(); + DCHECK(block->rows() == col.column->size()) + << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", + block->rows(), col.column->size(), col.name); } - IColumn::Filter result_filter(block->rows(), 1); - bool can_filter_all = false; +#endif + return Status::OK(); + } + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + RETURN_IF_ERROR(_build_pos_delete_filter(*read_rows)); - { - RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts( - _filter_conjuncts, &filters, block, &result_filter, &can_filter_all)); + std::vector columns_to_filter; + int column_to_keep = block->columns(); + columns_to_filter.resize(column_to_keep); + for (uint32_t i = 0; i < column_to_keep; ++i) { + columns_to_filter[i] = i; } + if (!_lazy_read_ctx.conjuncts.empty()) { + std::vector filters; + if (_position_delete_ctx.has_filter) { + filters.push_back(_pos_delete_filter_ptr.get()); + } + IColumn::Filter result_filter(block->rows(), 1); + bool can_filter_all = false; + + { + RETURN_IF_ERROR_OR_CATCH_EXCEPTION( + VExprContext::execute_conjuncts(_filter_conjuncts, &filters, block, + &result_filter, &can_filter_all)); + } - if (can_filter_all) { - for (auto& col : columns_to_filter) { - std::move(*block->get_by_position(col).column).assume_mutable()->clear(); + if (can_filter_all) { + for (auto& col : columns_to_filter) { + std::move(*block->get_by_position(col).column) + .assume_mutable() + ->clear(); + } + Block::erase_useless_column(block, column_to_keep); + 
_convert_dict_cols_to_string_cols(block); + return Status::OK(); } + + RETURN_IF_CATCH_EXCEPTION( + Block::filter_block_internal(block, columns_to_filter, result_filter)); Block::erase_useless_column(block, column_to_keep); - _convert_dict_cols_to_string_cols(block); - return Status::OK(); + } else { + RETURN_IF_CATCH_EXCEPTION(RETURN_IF_ERROR( + _filter_block(block, column_to_keep, columns_to_filter))); } - - RETURN_IF_CATCH_EXCEPTION( - Block::filter_block_internal(block, columns_to_filter, result_filter)); - Block::erase_useless_column(block, column_to_keep); - } else { - RETURN_IF_CATCH_EXCEPTION( - RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter))); + _convert_dict_cols_to_string_cols(block); } - _convert_dict_cols_to_string_cols(block); - } #ifndef NDEBUG - for (auto col : *block) { - col.column->sanity_check(); - DCHECK(block->rows() == col.column->size()) - << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", - block->rows(), col.column->size(), col.name); - } + for (auto col : *block) { + col.column->sanity_check(); + DCHECK(block->rows() == col.column->size()) + << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", + block->rows(), col.column->size(), col.name); + } #endif - *read_rows = block->rows(); - return Status::OK(); + *read_rows = block->rows(); + return Status::OK(); + } } -} -Status RowGroupReader::_read_column_data(Block* block, - const std::vector& table_columns, - size_t batch_size, size_t* read_rows, bool* batch_eof, - FilterMap& filter_map) { - size_t batch_read_rows = 0; - bool has_eof = false; - for (auto& read_col_name : table_columns) { - auto& column_with_type_and_name = - block->safe_get_by_position((*_col_name_to_block_idx)[read_col_name]); - auto& column_ptr = column_with_type_and_name.column; - auto& column_type = column_with_type_and_name.type; - bool is_dict_filter = false; - for (auto& _dict_filter_col : _dict_filter_cols) { - if (_dict_filter_col.first == read_col_name) { - MutableColumnPtr dict_column = ColumnInt32::create(); - if (!_col_name_to_block_idx->contains(read_col_name)) { - return Status::InternalError( - "Wrong read column '{}' in parquet file, block: {}", read_col_name, - block->dump_structure()); + Status RowGroupReader::_read_column_data( + Block * block, const std::vector& table_columns, size_t batch_size, + size_t* read_rows, bool* batch_eof, FilterMap& filter_map) { + size_t batch_read_rows = 0; + bool has_eof = false; + for (auto& read_col_name : table_columns) { + auto& column_with_type_and_name = + block->safe_get_by_position((*_col_name_to_block_idx)[read_col_name]); + auto& column_ptr = column_with_type_and_name.column; + auto& column_type = column_with_type_and_name.type; + bool is_dict_filter = false; + for (auto& _dict_filter_col : _dict_filter_cols) { + if (_dict_filter_col.first == read_col_name) { + MutableColumnPtr dict_column = ColumnInt32::create(); + if (!_col_name_to_block_idx->contains(read_col_name)) { + return Status::InternalError( + "Wrong read column '{}' in parquet file, block: {}", read_col_name, + block->dump_structure()); + } + if (column_type->is_nullable()) { + block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = + std::make_shared( + std::make_shared()); + block->replace_by_position( + (*_col_name_to_block_idx)[read_col_name], + ColumnNullable::create( + std::move(dict_column), + ColumnUInt8::create(dict_column->size(), 0))); + } else { + block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = + 
std::make_shared(); + block->replace_by_position((*_col_name_to_block_idx)[read_col_name], + std::move(dict_column)); + } + is_dict_filter = true; + break; } - if (column_type->is_nullable()) { - block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = - std::make_shared(std::make_shared()); - block->replace_by_position( - (*_col_name_to_block_idx)[read_col_name], - ColumnNullable::create(std::move(dict_column), - ColumnUInt8::create(dict_column->size(), 0))); - } else { - block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = - std::make_shared(); - block->replace_by_position((*_col_name_to_block_idx)[read_col_name], - std::move(dict_column)); + } + // P0-3: Also check lazy dict decode columns. These are lazy string columns + // confirmed as fully dict-encoded; we read them as int32 dict codes and + // convert back to strings after filtering. + if (!is_dict_filter) { + for (auto& lazy_dict_col : _lazy_dict_decode_cols) { + if (lazy_dict_col.first == read_col_name) { + MutableColumnPtr dict_column = ColumnInt32::create(); + if (column_type->is_nullable()) { + block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = + std::make_shared( + std::make_shared()); + block->replace_by_position( + (*_col_name_to_block_idx)[read_col_name], + ColumnNullable::create( + std::move(dict_column), + ColumnUInt8::create(dict_column->size(), 0))); + } else { + block->get_by_position((*_col_name_to_block_idx)[read_col_name]).type = + std::make_shared(); + block->replace_by_position((*_col_name_to_block_idx)[read_col_name], + std::move(dict_column)); + } + is_dict_filter = true; + break; + } } - is_dict_filter = true; - break; } - } - size_t col_read_rows = 0; - bool col_eof = false; - // Should reset _filter_map_index to 0 when reading next column. - // select_vector.reset(); - _column_readers[read_col_name]->reset_filter_map_index(); - while (!col_eof && col_read_rows < batch_size) { - size_t loop_rows = 0; - RETURN_IF_ERROR(_column_readers[read_col_name]->read_column_data( - column_ptr, column_type, _table_info_node_ptr->get_children_node(read_col_name), - filter_map, batch_size - col_read_rows, &loop_rows, &col_eof, is_dict_filter)); + size_t col_read_rows = 0; + bool col_eof = false; + // Should reset _filter_map_index to 0 when reading next column. + // select_vector.reset(); + _column_readers[read_col_name]->reset_filter_map_index(); + while (!col_eof && col_read_rows < batch_size) { + size_t loop_rows = 0; + RETURN_IF_ERROR(_column_readers[read_col_name]->read_column_data( + column_ptr, column_type, + _table_info_node_ptr->get_children_node(read_col_name), filter_map, + batch_size - col_read_rows, &loop_rows, &col_eof, is_dict_filter)); + VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name + << "' loop_rows=" << loop_rows + << " col_read_rows_so_far=" << col_read_rows << std::endl; + col_read_rows += loop_rows; + } VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name - << "' loop_rows=" << loop_rows << " col_read_rows_so_far=" << col_read_rows - << std::endl; - col_read_rows += loop_rows; - } - VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name - << "' read_rows=" << col_read_rows << std::endl; - if (batch_read_rows > 0 && batch_read_rows != col_read_rows) { - LOG(WARNING) << "[RowGroupReader] Mismatched read rows among parquet columns. 
" - "previous_batch_read_rows=" - << batch_read_rows << ", current_column='" << read_col_name - << "', current_col_read_rows=" << col_read_rows; - return Status::Corruption("Can't read the same number of rows among parquet columns"); - } - batch_read_rows = col_read_rows; + << "' read_rows=" << col_read_rows << std::endl; + if (batch_read_rows > 0 && batch_read_rows != col_read_rows) { + LOG(WARNING) << "[RowGroupReader] Mismatched read rows among parquet columns. " + "previous_batch_read_rows=" + << batch_read_rows << ", current_column='" << read_col_name + << "', current_col_read_rows=" << col_read_rows; + return Status::Corruption( + "Can't read the same number of rows among parquet columns"); + } + batch_read_rows = col_read_rows; #ifndef NDEBUG - column_ptr->sanity_check(); + column_ptr->sanity_check(); #endif - if (col_eof) { - has_eof = true; + if (col_eof) { + has_eof = true; + } } - } - *read_rows = batch_read_rows; - *batch_eof = has_eof; + *read_rows = batch_read_rows; + *batch_eof = has_eof; - return Status::OK(); -} - -Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* read_rows, - bool* batch_eof) { - std::unique_ptr filter_map_ptr = nullptr; - size_t pre_read_rows; - bool pre_eof; - std::vector columns_to_filter; - uint32_t origin_column_num = block->columns(); - columns_to_filter.resize(origin_column_num); - for (uint32_t i = 0; i < origin_column_num; ++i) { - columns_to_filter[i] = i; + return Status::OK(); } - IColumn::Filter result_filter; - size_t pre_raw_read_rows = 0; - while (!_state->is_cancelled()) { - // read predicate columns - pre_read_rows = 0; - pre_eof = false; - FilterMap filter_map; - RETURN_IF_ERROR(_read_column_data(block, _lazy_read_ctx.predicate_columns.first, batch_size, - &pre_read_rows, &pre_eof, filter_map)); - if (pre_read_rows == 0) { - DCHECK_EQ(pre_eof, true); - break; - } - pre_raw_read_rows += pre_read_rows; - RETURN_IF_ERROR(_fill_partition_columns(block, pre_read_rows, - _lazy_read_ctx.predicate_partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, pre_read_rows, - _lazy_read_ctx.predicate_missing_columns)); - RETURN_IF_ERROR(_fill_row_id_columns(block, pre_read_rows, false)); - - RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows)); -#ifndef NDEBUG - for (auto col : *block) { - if (col.column->size() == 0) { // lazy read column. - continue; - } - col.column->sanity_check(); - DCHECK(pre_read_rows == col.column->size()) - << absl::Substitute("pre_read_rows = $0 , column rows = $1, col name = $2", - pre_read_rows, col.column->size(), col.name); + Status RowGroupReader::_do_lazy_read(Block * block, size_t batch_size, size_t * read_rows, + bool* batch_eof) { + // Dispatch to per-column lazy read when enabled (P0-2 optimization) + if (_enable_per_column_lazy_read) { + return _do_lazy_read_per_column(block, batch_size, read_rows, batch_eof); } -#endif - - bool can_filter_all = false; - { - SCOPED_RAW_TIMER(&_predicate_filter_time); - - // generate filter vector - if (_lazy_read_ctx.resize_first_column) { - // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 - // The following process may be tricky and time-consuming, but we have no other way. 
- block->get_by_position(0).column->assume_mutable()->resize(pre_read_rows); - } - result_filter.assign(pre_read_rows, static_cast(1)); - std::vector filters; - if (_position_delete_ctx.has_filter) { - filters.push_back(_pos_delete_filter_ptr.get()); - } - VExprContextSPtrs filter_contexts; - for (auto& conjunct : _filter_conjuncts) { - filter_contexts.emplace_back(conjunct); + std::unique_ptr filter_map_ptr = nullptr; + size_t pre_read_rows; + bool pre_eof; + std::vector columns_to_filter; + uint32_t origin_column_num = block->columns(); + columns_to_filter.resize(origin_column_num); + for (uint32_t i = 0; i < origin_column_num; ++i) { + columns_to_filter[i] = i; + } + IColumn::Filter result_filter; + size_t pre_raw_read_rows = 0; + while (!_state->is_cancelled()) { + // read predicate columns + pre_read_rows = 0; + pre_eof = false; + FilterMap filter_map; + RETURN_IF_ERROR(_read_column_data(block, _lazy_read_ctx.predicate_columns.first, + batch_size, &pre_read_rows, &pre_eof, filter_map)); + if (pre_read_rows == 0) { + DCHECK_EQ(pre_eof, true); + break; } + pre_raw_read_rows += pre_read_rows; + RETURN_IF_ERROR(_fill_partition_columns(block, pre_read_rows, + _lazy_read_ctx.predicate_partition_columns)); + RETURN_IF_ERROR(_fill_missing_columns(block, pre_read_rows, + _lazy_read_ctx.predicate_missing_columns)); + RETURN_IF_ERROR(_fill_row_id_columns(block, pre_read_rows, false)); - { - RETURN_IF_ERROR(VExprContext::execute_conjuncts(filter_contexts, &filters, block, - &result_filter, &can_filter_all)); - } + RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows)); - if (_lazy_read_ctx.resize_first_column) { - // We have to clean the first column to insert right data. - block->get_by_position(0).column->assume_mutable()->clear(); +#ifndef NDEBUG + for (auto col : *block) { + if (col.column->size() == 0) { // lazy read column. + continue; + } + col.column->sanity_check(); + DCHECK(pre_read_rows == col.column->size()) + << absl::Substitute("pre_read_rows = $0 , column rows = $1, col name = $2", + pre_read_rows, col.column->size(), col.name); } - } +#endif - const uint8_t* __restrict filter_map_data = result_filter.data(); - filter_map_ptr = std::make_unique(); - RETURN_IF_ERROR(filter_map_ptr->init(filter_map_data, pre_read_rows, can_filter_all)); - if (filter_map_ptr->filter_all()) { + bool can_filter_all = false; { SCOPED_RAW_TIMER(&_predicate_filter_time); - for (const auto& col : _lazy_read_ctx.predicate_columns.first) { - // clean block to read predicate columns - block->get_by_position((*_col_name_to_block_idx)[col]) - .column->assume_mutable() - ->clear(); + + // generate filter vector + if (_lazy_read_ctx.resize_first_column) { + // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 + // The following process may be tricky and time-consuming, but we have no other way. 
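+                    // Resizing the (still empty) first column gives the block a non-zero row count, so
+                    // execute_conjuncts below actually evaluates the predicates; the column is cleared
+                    // again right after to make room for the real data.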
+ block->get_by_position(0).column->assume_mutable()->resize(pre_read_rows); + } + result_filter.assign(pre_read_rows, static_cast(1)); + std::vector filters; + if (_position_delete_ctx.has_filter) { + filters.push_back(_pos_delete_filter_ptr.get()); } - for (const auto& col : _lazy_read_ctx.predicate_partition_columns) { - block->get_by_position((*_col_name_to_block_idx)[col.first]) - .column->assume_mutable() - ->clear(); + + VExprContextSPtrs filter_contexts; + for (auto& conjunct : _filter_conjuncts) { + filter_contexts.emplace_back(conjunct); } - for (const auto& col : _lazy_read_ctx.predicate_missing_columns) { - block->get_by_position((*_col_name_to_block_idx)[col.first]) - .column->assume_mutable() - ->clear(); + + { + RETURN_IF_ERROR(VExprContext::execute_conjuncts( + filter_contexts, &filters, block, &result_filter, &can_filter_all)); } - if (_row_id_column_iterator_pair.first != nullptr) { - block->get_by_position(_row_id_column_iterator_pair.second) - .column->assume_mutable() - ->clear(); + + if (_lazy_read_ctx.resize_first_column) { + // We have to clean the first column to insert right data. + block->get_by_position(0).column->assume_mutable()->clear(); } - Block::erase_useless_column(block, origin_column_num); } - if (!pre_eof) { - // If continuous batches are skipped, we can cache them to skip a whole page - _cached_filtered_rows += pre_read_rows; - if (pre_raw_read_rows >= config::doris_scanner_row_num) { + const uint8_t* __restrict filter_map_data = result_filter.data(); + filter_map_ptr = std::make_unique(); + RETURN_IF_ERROR(filter_map_ptr->init(filter_map_data, pre_read_rows, can_filter_all)); + if (filter_map_ptr->filter_all()) { + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + for (const auto& col : _lazy_read_ctx.predicate_columns.first) { + // clean block to read predicate columns + block->get_by_position((*_col_name_to_block_idx)[col]) + .column->assume_mutable() + ->clear(); + } + for (const auto& col : _lazy_read_ctx.predicate_partition_columns) { + block->get_by_position((*_col_name_to_block_idx)[col.first]) + .column->assume_mutable() + ->clear(); + } + for (const auto& col : _lazy_read_ctx.predicate_missing_columns) { + block->get_by_position((*_col_name_to_block_idx)[col.first]) + .column->assume_mutable() + ->clear(); + } + if (_row_id_column_iterator_pair.first != nullptr) { + block->get_by_position(_row_id_column_iterator_pair.second) + .column->assume_mutable() + ->clear(); + } + Block::erase_useless_column(block, origin_column_num); + } + + if (!pre_eof) { + // If continuous batches are skipped, we can cache them to skip a whole page + _cached_filtered_rows += pre_read_rows; + if (pre_raw_read_rows >= config::doris_scanner_row_num) { + *read_rows = 0; + _convert_dict_cols_to_string_cols(block); + return Status::OK(); + } + } else { // pre_eof + // If filter_map_ptr->filter_all() and pre_eof, we can skip whole row group. *read_rows = 0; + *batch_eof = true; + _lazy_read_filtered_rows += (pre_read_rows + _cached_filtered_rows); _convert_dict_cols_to_string_cols(block); return Status::OK(); } - } else { // pre_eof - // If filter_map_ptr->filter_all() and pre_eof, we can skip whole row group. 
- *read_rows = 0; - *batch_eof = true; - _lazy_read_filtered_rows += (pre_read_rows + _cached_filtered_rows); - _convert_dict_cols_to_string_cols(block); - return Status::OK(); + } else { + break; } - } else { - break; } - } - if (_state->is_cancelled()) { - return Status::Cancelled("cancelled"); - } + if (_state->is_cancelled()) { + return Status::Cancelled("cancelled"); + } - if (filter_map_ptr == nullptr) { - DCHECK_EQ(pre_read_rows + _cached_filtered_rows, 0); - *read_rows = 0; - *batch_eof = true; - return Status::OK(); - } + if (filter_map_ptr == nullptr) { + DCHECK_EQ(pre_read_rows + _cached_filtered_rows, 0); + *read_rows = 0; + *batch_eof = true; + return Status::OK(); + } - FilterMap& filter_map = *filter_map_ptr; - DorisUniqueBufferPtr rebuild_filter_map = nullptr; - if (_cached_filtered_rows != 0) { - RETURN_IF_ERROR(_rebuild_filter_map(filter_map, rebuild_filter_map, pre_read_rows)); - pre_read_rows += _cached_filtered_rows; - _cached_filtered_rows = 0; - } + FilterMap& filter_map = *filter_map_ptr; + DorisUniqueBufferPtr rebuild_filter_map = nullptr; + if (_cached_filtered_rows != 0) { + RETURN_IF_ERROR(_rebuild_filter_map(filter_map, rebuild_filter_map, pre_read_rows)); + pre_read_rows += _cached_filtered_rows; + _cached_filtered_rows = 0; + } - // lazy read columns - size_t lazy_read_rows; - bool lazy_eof; - RETURN_IF_ERROR(_read_column_data(block, _lazy_read_ctx.lazy_read_columns, pre_read_rows, - &lazy_read_rows, &lazy_eof, filter_map)); + // lazy read columns + size_t lazy_read_rows; + bool lazy_eof; + RETURN_IF_ERROR(_read_column_data(block, _lazy_read_ctx.lazy_read_columns, pre_read_rows, + &lazy_read_rows, &lazy_eof, filter_map)); - if (pre_read_rows != lazy_read_rows) { - return Status::Corruption("Can't read the same number of rows when doing lazy read"); - } - // pre_eof ^ lazy_eof - // we set pre_read_rows as batch_size for lazy read columns, so pre_eof != lazy_eof + if (pre_read_rows != lazy_read_rows) { + return Status::Corruption("Can't read the same number of rows when doing lazy read"); + } + // pre_eof ^ lazy_eof + // we set pre_read_rows as batch_size for lazy read columns, so pre_eof != lazy_eof - // filter data in predicate columns, and remove filter column - { - SCOPED_RAW_TIMER(&_predicate_filter_time); - if (filter_map.has_filter()) { - RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal( - block, _lazy_read_ctx.all_predicate_col_ids, result_filter)); - Block::erase_useless_column(block, origin_column_num); + // filter data in predicate columns, and remove filter column + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + if (filter_map.has_filter()) { + std::vector predicate_columns = _lazy_read_ctx.all_predicate_col_ids; + if (_iceberg_rowid_params.enabled) { + int row_id_idx = + block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); + if (row_id_idx >= 0 && + std::find(predicate_columns.begin(), predicate_columns.end(), + static_cast(row_id_idx)) == predicate_columns.end()) { + predicate_columns.push_back(static_cast(row_id_idx)); + } + } + RETURN_IF_CATCH_EXCEPTION( + Block::filter_block_internal(block, predicate_columns, result_filter)); + Block::erase_useless_column(block, origin_column_num); - } else { - Block::erase_useless_column(block, origin_column_num); + } else { + Block::erase_useless_column(block, origin_column_num); + } } - } - _convert_dict_cols_to_string_cols(block); + _convert_dict_cols_to_string_cols(block); + _convert_lazy_dict_cols_to_string_cols(block); - size_t column_num = block->columns(); - size_t column_size = 0; 
- for (int i = 0; i < column_num; ++i) { - size_t cz = block->get_by_position(i).column->size(); - if (column_size != 0 && cz != 0) { - DCHECK_EQ(column_size, cz); - } - if (cz != 0) { - column_size = cz; + size_t column_num = block->columns(); + size_t column_size = 0; + for (int i = 0; i < column_num; ++i) { + size_t cz = block->get_by_position(i).column->size(); + if (column_size != 0 && cz != 0) { + DCHECK_EQ(column_size, cz); + } + if (cz != 0) { + column_size = cz; + } } - } - _lazy_read_filtered_rows += pre_read_rows - column_size; - *read_rows = column_size; + _lazy_read_filtered_rows += pre_read_rows - column_size; + *read_rows = column_size; - *batch_eof = pre_eof; - RETURN_IF_ERROR(_fill_partition_columns(block, column_size, _lazy_read_ctx.partition_columns)); - RETURN_IF_ERROR(_fill_missing_columns(block, column_size, _lazy_read_ctx.missing_columns)); + *batch_eof = pre_eof; + RETURN_IF_ERROR( + _fill_partition_columns(block, column_size, _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR(_fill_missing_columns(block, column_size, _lazy_read_ctx.missing_columns)); #ifndef NDEBUG - for (auto col : *block) { - col.column->sanity_check(); - DCHECK(block->rows() == col.column->size()) - << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", - block->rows(), col.column->size(), col.name); - } + for (auto col : *block) { + col.column->sanity_check(); + DCHECK(block->rows() == col.column->size()) + << absl::Substitute("block rows = $0 , column rows = $1, col name = $2", + block->rows(), col.column->size(), col.name); + } #endif - return Status::OK(); -} - -Status RowGroupReader::_rebuild_filter_map(FilterMap& filter_map, - DorisUniqueBufferPtr& filter_map_data, - size_t pre_read_rows) const { - if (_cached_filtered_rows == 0) { - return Status::OK(); - } - size_t total_rows = _cached_filtered_rows + pre_read_rows; - if (filter_map.filter_all()) { - RETURN_IF_ERROR(filter_map.init(nullptr, total_rows, true)); return Status::OK(); } - filter_map_data = make_unique_buffer(total_rows); - auto* map = filter_map_data.get(); - for (size_t i = 0; i < _cached_filtered_rows; ++i) { - map[i] = 0; - } - const uint8_t* old_map = filter_map.filter_map_data(); - if (old_map == nullptr) { - // select_vector.filter_all() == true is already built. - for (size_t i = _cached_filtered_rows; i < total_rows; ++i) { - map[i] = 1; + Status RowGroupReader::_rebuild_filter_map(FilterMap & filter_map, + DorisUniqueBufferPtr & filter_map_data, + size_t pre_read_rows) const { + if (_cached_filtered_rows == 0) { + return Status::OK(); + } + size_t total_rows = _cached_filtered_rows + pre_read_rows; + if (filter_map.filter_all()) { + RETURN_IF_ERROR(filter_map.init(nullptr, total_rows, true)); + return Status::OK(); } - } else { - memcpy(map + _cached_filtered_rows, old_map, pre_read_rows); - } - RETURN_IF_ERROR(filter_map.init(map, total_rows, false)); - return Status::OK(); -} -Status RowGroupReader::_fill_partition_columns( - Block* block, size_t rows, - const std::unordered_map>& - partition_columns) { - DataTypeSerDe::FormatOptions _text_formatOptions; - for (const auto& kv : partition_columns) { - auto doris_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]).column; - // obtained from block*, it is a mutable object. 
- auto* col_ptr = const_cast(doris_column.get()); - const auto& [value, slot_desc] = kv.second; - auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); - Slice slice(value.data(), value.size()); - uint64_t num_deserialized = 0; - // Be careful when reading empty rows from parquet row groups. - if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, - &num_deserialized, - _text_formatOptions) != Status::OK()) { - return Status::InternalError("Failed to fill partition column: {}={}", - slot_desc->col_name(), value); + filter_map_data = make_unique_buffer(total_rows); + auto* map = filter_map_data.get(); + for (size_t i = 0; i < _cached_filtered_rows; ++i) { + map[i] = 0; } - if (num_deserialized != rows) { - return Status::InternalError( - "Failed to fill partition column: {}={} ." - "Number of rows expected to be written : {}, number of rows actually written : " - "{}", - slot_desc->col_name(), value, num_deserialized, rows); + const uint8_t* old_map = filter_map.filter_map_data(); + if (old_map == nullptr) { + // select_vector.filter_all() == true is already built. + for (size_t i = _cached_filtered_rows; i < total_rows; ++i) { + map[i] = 1; + } + } else { + memcpy(map + _cached_filtered_rows, old_map, pre_read_rows); } + RETURN_IF_ERROR(filter_map.init(map, total_rows, false)); + return Status::OK(); } - return Status::OK(); -} -Status RowGroupReader::_fill_missing_columns( - Block* block, size_t rows, - const std::unordered_map& missing_columns) { - for (const auto& kv : missing_columns) { - if (!_col_name_to_block_idx->contains(kv.first)) { - return Status::InternalError("Missing column: {} not found in block {}", kv.first, - block->dump_structure()); - } - if (kv.second == nullptr) { - // no default column, fill with null - auto mutable_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]) - .column->assume_mutable(); - auto* nullable_column = assert_cast(mutable_column.get()); - nullable_column->insert_many_defaults(rows); - } else { - // fill with default value - const auto& ctx = kv.second; - ColumnPtr result_column_ptr; - // PT1 => dest primitive type - RETURN_IF_ERROR(ctx->execute(block, result_column_ptr)); - if (result_column_ptr->use_count() == 1) { - // call resize because the first column of _src_block_ptr may not be filled by reader, - // so _src_block_ptr->rows() may return wrong result, cause the column created by `ctx->execute()` - // has only one row. - auto mutable_column = result_column_ptr->assume_mutable(); - mutable_column->resize(rows); - // result_column_ptr maybe a ColumnConst, convert it to a normal column - result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = - block->get_by_position((*_col_name_to_block_idx)[kv.first]).type; - bool is_nullable = origin_column_type->is_nullable(); - block->replace_by_position( - (*_col_name_to_block_idx)[kv.first], - is_nullable ? make_nullable(result_column_ptr) : result_column_ptr); + Status RowGroupReader::_fill_partition_columns( + Block * block, size_t rows, + const std::unordered_map>& + partition_columns) { + DataTypeSerDe::FormatOptions _text_formatOptions; + for (const auto& kv : partition_columns) { + auto doris_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]).column; + // obtained from block*, it is a mutable object. 
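+            // (the ColumnPtr handed out by the block is const, but the column it points to is owned by
+            // this block and safe to mutate here, so the const_cast below lets the serde deserialize
+            // the partition value directly into it)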
+ auto* col_ptr = const_cast(doris_column.get()); + const auto& [value, slot_desc] = kv.second; + auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); + Slice slice(value.data(), value.size()); + uint64_t num_deserialized = 0; + // Be careful when reading empty rows from parquet row groups. + if (_text_serde->deserialize_column_from_fixed_json( + *col_ptr, slice, rows, &num_deserialized, _text_formatOptions) != + Status::OK()) { + return Status::InternalError("Failed to fill partition column: {}={}", + slot_desc->col_name(), value); + } + if (num_deserialized != rows) { + return Status::InternalError( + "Failed to fill partition column: {}={} ." + "Number of rows expected to be written : {}, number of rows actually " + "written : " + "{}", + slot_desc->col_name(), value, num_deserialized, rows); } } + return Status::OK(); } - return Status::OK(); -} -Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t* read_rows, bool* batch_eof, - bool* modify_row_ids) { - *modify_row_ids = false; - if (_position_delete_ctx.has_filter) { - int64_t start_row_id = _position_delete_ctx.current_row_id; - int64_t end_row_id = std::min(_position_delete_ctx.current_row_id + (int64_t)batch_size, - _position_delete_ctx.last_row_id); - int64_t num_delete_rows = 0; - auto before_index = _position_delete_ctx.index; - while (_position_delete_ctx.index < _position_delete_ctx.end_index) { - const int64_t& delete_row_id = - _position_delete_ctx.delete_rows[_position_delete_ctx.index]; - if (delete_row_id < start_row_id) { - _position_delete_ctx.index++; - before_index = _position_delete_ctx.index; - } else if (delete_row_id < end_row_id) { - num_delete_rows++; - _position_delete_ctx.index++; - } else { // delete_row_id >= end_row_id - break; + Status RowGroupReader::_fill_missing_columns( + Block * block, size_t rows, + const std::unordered_map& missing_columns) { + for (const auto& kv : missing_columns) { + if (!_col_name_to_block_idx->contains(kv.first)) { + return Status::InternalError("Missing column: {} not found in block {}", kv.first, + block->dump_structure()); + } + if (kv.second == nullptr) { + // no default column, fill with null + auto mutable_column = block->get_by_position((*_col_name_to_block_idx)[kv.first]) + .column->assume_mutable(); + auto* nullable_column = + assert_cast(mutable_column.get()); + nullable_column->insert_many_defaults(rows); + } else { + // fill with default value + const auto& ctx = kv.second; + ColumnPtr result_column_ptr; + // PT1 => dest primitive type + RETURN_IF_ERROR(ctx->execute(block, result_column_ptr)); + if (result_column_ptr->use_count() == 1) { + // call resize because the first column of _src_block_ptr may not be filled by reader, + // so _src_block_ptr->rows() may return wrong result, cause the column created by `ctx->execute()` + // has only one row. + auto mutable_column = result_column_ptr->assume_mutable(); + mutable_column->resize(rows); + // result_column_ptr maybe a ColumnConst, convert it to a normal column + result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); + auto origin_column_type = + block->get_by_position((*_col_name_to_block_idx)[kv.first]).type; + bool is_nullable = origin_column_type->is_nullable(); + block->replace_by_position( + (*_col_name_to_block_idx)[kv.first], + is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); + } } } - *read_rows = end_row_id - start_row_id - num_delete_rows; - _position_delete_ctx.current_row_id = end_row_id; - *batch_eof = _position_delete_ctx.current_row_id == _position_delete_ctx.last_row_id; + return Status::OK(); + } - if (_row_id_column_iterator_pair.first != nullptr) { - *modify_row_ids = true; - _current_batch_row_ids.clear(); - _current_batch_row_ids.resize(*read_rows); - size_t idx = 0; - for (auto id = start_row_id; id < end_row_id; id++) { - if (before_index < _position_delete_ctx.index && - id == _position_delete_ctx.delete_rows[before_index]) { - before_index++; - continue; + Status RowGroupReader::_read_empty_batch(size_t batch_size, size_t * read_rows, bool* batch_eof, + bool* modify_row_ids) { + *modify_row_ids = false; + if (_position_delete_ctx.has_filter) { + int64_t start_row_id = _position_delete_ctx.current_row_id; + int64_t end_row_id = std::min(_position_delete_ctx.current_row_id + (int64_t)batch_size, + _position_delete_ctx.last_row_id); + int64_t num_delete_rows = 0; + auto before_index = _position_delete_ctx.index; + while (_position_delete_ctx.index < _position_delete_ctx.end_index) { + const int64_t& delete_row_id = + _position_delete_ctx.delete_rows[_position_delete_ctx.index]; + if (delete_row_id < start_row_id) { + _position_delete_ctx.index++; + before_index = _position_delete_ctx.index; + } else if (delete_row_id < end_row_id) { + num_delete_rows++; + _position_delete_ctx.index++; + } else { // delete_row_id >= end_row_id + break; + } + } + *read_rows = end_row_id - start_row_id - num_delete_rows; + _position_delete_ctx.current_row_id = end_row_id; + *batch_eof = _position_delete_ctx.current_row_id == _position_delete_ctx.last_row_id; + + if (_row_id_column_iterator_pair.first != nullptr) { + *modify_row_ids = true; + _current_batch_row_ids.clear(); + _current_batch_row_ids.resize(*read_rows); + size_t idx = 0; + for (auto id = start_row_id; id < end_row_id; id++) { + if (before_index < _position_delete_ctx.index && + id == _position_delete_ctx.delete_rows[before_index]) { + before_index++; + continue; + } + _current_batch_row_ids[idx++] = (rowid_t)id; } - _current_batch_row_ids[idx++] = (rowid_t)id; } - } - } else { - if (batch_size < _remaining_rows) { - *read_rows = batch_size; - _remaining_rows -= batch_size; - *batch_eof = false; } else { - *read_rows = _remaining_rows; - _remaining_rows = 0; - *batch_eof = true; + if (batch_size < _remaining_rows) { + *read_rows = batch_size; + _remaining_rows -= batch_size; + *batch_eof = false; + } else { + *read_rows = _remaining_rows; + _remaining_rows = 0; + *batch_eof = true; + } } + _total_read_rows += *read_rows; + return Status::OK(); } - _total_read_rows += *read_rows; - return Status::OK(); -} -Status RowGroupReader::_get_current_batch_row_id(size_t read_rows) { - _current_batch_row_ids.clear(); - _current_batch_row_ids.resize(read_rows); + Status RowGroupReader::_get_current_batch_row_id(size_t read_rows) { + _current_batch_row_ids.clear(); + _current_batch_row_ids.resize(read_rows); - int64_t idx = 0; - int64_t read_range_rows = 0; - for (size_t range_idx = 0; range_idx < _read_ranges.range_size(); range_idx++) { - auto range = _read_ranges.get_range(range_idx); - if (read_rows == 0) { - break; - } - if (read_range_rows + (range.to() - range.from()) > _total_read_rows) { - int64_t fi = - std::max(_total_read_rows, read_range_rows) - read_range_rows + range.from(); - size_t len = std::min(read_rows, (size_t)(std::max(range.to(), fi) - 
fi)); + int64_t idx = 0; + int64_t read_range_rows = 0; + for (size_t range_idx = 0; range_idx < _read_ranges.range_size(); range_idx++) { + auto range = _read_ranges.get_range(range_idx); + if (read_rows == 0) { + break; + } + if (read_range_rows + (range.to() - range.from()) > _total_read_rows) { + int64_t fi = std::max(_total_read_rows, read_range_rows) - read_range_rows + + range.from(); + size_t len = std::min(read_rows, (size_t)(std::max(range.to(), fi) - fi)); - read_rows -= len; + read_rows -= len; - for (auto i = 0; i < len; i++) { - _current_batch_row_ids[idx++] = - (rowid_t)(fi + i + _current_row_group_idx.first_row); + for (auto i = 0; i < len; i++) { + _current_batch_row_ids[idx++] = + (rowid_t)(fi + i + _current_row_group_idx.first_row); + } } + read_range_rows += range.to() - range.from(); } - read_range_rows += range.to() - range.from(); + return Status::OK(); } - return Status::OK(); -} -Status RowGroupReader::_fill_row_id_columns(Block* block, size_t read_rows, - bool is_current_row_ids) { - if (_row_id_column_iterator_pair.first != nullptr) { - if (!is_current_row_ids) { - RETURN_IF_ERROR(_get_current_batch_row_id(read_rows)); + Status RowGroupReader::_fill_row_id_columns(Block * block, size_t read_rows, + bool is_current_row_ids) { + if (_row_id_column_iterator_pair.first != nullptr) { + if (!is_current_row_ids) { + RETURN_IF_ERROR(_get_current_batch_row_id(read_rows)); + } + auto col = block->get_by_position(_row_id_column_iterator_pair.second) + .column->assume_mutable(); + RETURN_IF_ERROR(_row_id_column_iterator_pair.first->read_by_rowids( + _current_batch_row_ids.data(), _current_batch_row_ids.size(), col)); } - auto col = block->get_by_position(_row_id_column_iterator_pair.second) - .column->assume_mutable(); - RETURN_IF_ERROR(_row_id_column_iterator_pair.first->read_by_rowids( - _current_batch_row_ids.data(), _current_batch_row_ids.size(), col)); - } - return Status::OK(); -} - -Status RowGroupReader::_build_pos_delete_filter(size_t read_rows) { - if (!_position_delete_ctx.has_filter) { - _pos_delete_filter_ptr.reset(nullptr); - _total_read_rows += read_rows; return Status::OK(); } - _pos_delete_filter_ptr.reset(new IColumn::Filter(read_rows, 1)); - auto* __restrict _pos_delete_filter_data = _pos_delete_filter_ptr->data(); - while (_position_delete_ctx.index < _position_delete_ctx.end_index) { - const int64_t delete_row_index_in_row_group = - _position_delete_ctx.delete_rows[_position_delete_ctx.index] - - _position_delete_ctx.first_row_id; - int64_t read_range_rows = 0; - size_t remaining_read_rows = _total_read_rows + read_rows; - for (size_t range_idx = 0; range_idx < _read_ranges.range_size(); range_idx++) { - auto range = _read_ranges.get_range(range_idx); - if (delete_row_index_in_row_group < range.from()) { - ++_position_delete_ctx.index; - break; - } else if (delete_row_index_in_row_group < range.to()) { - int64_t index = (delete_row_index_in_row_group - range.from()) + read_range_rows - - _total_read_rows; - if (index > read_rows - 1) { + + Status RowGroupReader::_build_pos_delete_filter(size_t read_rows) { + if (!_position_delete_ctx.has_filter) { + _pos_delete_filter_ptr.reset(nullptr); + _total_read_rows += read_rows; + return Status::OK(); + } + _pos_delete_filter_ptr.reset(new IColumn::Filter(read_rows, 1)); + auto* __restrict _pos_delete_filter_data = _pos_delete_filter_ptr->data(); + while (_position_delete_ctx.index < _position_delete_ctx.end_index) { + const int64_t delete_row_index_in_row_group = + 
_position_delete_ctx.delete_rows[_position_delete_ctx.index] - + _position_delete_ctx.first_row_id; + int64_t read_range_rows = 0; + size_t remaining_read_rows = _total_read_rows + read_rows; + for (size_t range_idx = 0; range_idx < _read_ranges.range_size(); range_idx++) { + auto range = _read_ranges.get_range(range_idx); + if (delete_row_index_in_row_group < range.from()) { + ++_position_delete_ctx.index; + break; + } else if (delete_row_index_in_row_group < range.to()) { + int64_t index = (delete_row_index_in_row_group - range.from()) + + read_range_rows - _total_read_rows; + if (index > read_rows - 1) { + _total_read_rows += read_rows; + return Status::OK(); + } + _pos_delete_filter_data[index] = 0; + ++_position_delete_ctx.index; + break; + } else { // delete_row >= range.last_row + } + + int64_t range_size = range.to() - range.from(); + // Don't search next range when there is no remaining_read_rows. + if (remaining_read_rows <= range_size) { _total_read_rows += read_rows; return Status::OK(); + } else { + remaining_read_rows -= range_size; + read_range_rows += range_size; } - _pos_delete_filter_data[index] = 0; - ++_position_delete_ctx.index; - break; - } else { // delete_row >= range.last_row - } - - int64_t range_size = range.to() - range.from(); - // Don't search next range when there is no remaining_read_rows. - if (remaining_read_rows <= range_size) { - _total_read_rows += read_rows; - return Status::OK(); - } else { - remaining_read_rows -= range_size; - read_range_rows += range_size; } } + _total_read_rows += read_rows; + return Status::OK(); } - _total_read_rows += read_rows; - return Status::OK(); -} -// need exception safety -Status RowGroupReader::_filter_block(Block* block, int column_to_keep, - const std::vector& columns_to_filter) { - if (_pos_delete_filter_ptr) { - RETURN_IF_CATCH_EXCEPTION( - Block::filter_block_internal(block, columns_to_filter, (*_pos_delete_filter_ptr))); - } - Block::erase_useless_column(block, column_to_keep); + // need exception safety + Status RowGroupReader::_filter_block(Block * block, int column_to_keep, + const std::vector& columns_to_filter) { + if (_pos_delete_filter_ptr) { + RETURN_IF_CATCH_EXCEPTION(Block::filter_block_internal(block, columns_to_filter, + (*_pos_delete_filter_ptr))); + } + Block::erase_useless_column(block, column_to_keep); - return Status::OK(); -} + return Status::OK(); + } -Status RowGroupReader::_rewrite_dict_predicates() { - SCOPED_RAW_TIMER(&_dict_filter_rewrite_time); - for (auto it = _dict_filter_cols.begin(); it != _dict_filter_cols.end();) { - std::string& dict_filter_col_name = it->first; - int slot_id = it->second; - // 1. Get dictionary values to a string column. - MutableColumnPtr dict_value_column = ColumnString::create(); - bool has_dict = false; - RETURN_IF_ERROR(_column_readers[dict_filter_col_name]->read_dict_values_to_column( - dict_value_column, &has_dict)); + Status RowGroupReader::_rewrite_dict_predicates() { + SCOPED_RAW_TIMER(&_dict_filter_rewrite_time); + for (auto it = _dict_filter_cols.begin(); it != _dict_filter_cols.end();) { + std::string& dict_filter_col_name = it->first; + int slot_id = it->second; + // 1. Get dictionary values to a string column. 
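+ // Note (assumption based on the surrounding steps 1-4): the dict-filter rewrite
+ // evaluates the conjuncts once against the dictionary values instead of every data
+ // row; surviving dictionary entries become dict-code predicates in steps 3/4 below.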
+ MutableColumnPtr dict_value_column = ColumnString::create(); + bool has_dict = false; + RETURN_IF_ERROR(_column_readers[dict_filter_col_name]->read_dict_values_to_column( + dict_value_column, &has_dict)); #ifndef NDEBUG - dict_value_column->sanity_check(); + dict_value_column->sanity_check(); #endif - size_t dict_value_column_size = dict_value_column->size(); - DCHECK(has_dict); - // 2. Build a temp block from the dict string column, then execute conjuncts and filter block. - // 2.1 Build a temp block from the dict string column to match the conjuncts executing. - Block temp_block; - int dict_pos = -1; - int index = 0; - for (const auto slot_desc : _tuple_descriptor->slots()) { - if (slot_desc->id() == slot_id) { - auto data_type = slot_desc->get_data_type_ptr(); - if (data_type->is_nullable()) { - temp_block.insert( - {ColumnNullable::create( - std::move( - dict_value_column), // NOLINT(bugprone-use-after-move) - ColumnUInt8::create(dict_value_column_size, 0)), - std::make_shared(std::make_shared()), - ""}); + size_t dict_value_column_size = dict_value_column->size(); + DCHECK(has_dict); + // 2. Build a temp block from the dict string column, then execute conjuncts and filter block. + // 2.1 Build a temp block from the dict string column to match the conjuncts executing. + Block temp_block; + int dict_pos = -1; + int index = 0; + for (const auto slot_desc : _tuple_descriptor->slots()) { + if (slot_desc->id() == slot_id) { + auto data_type = slot_desc->get_data_type_ptr(); + if (data_type->is_nullable()) { + temp_block.insert( + {ColumnNullable::create( + std::move( + dict_value_column), // NOLINT(bugprone-use-after-move) + ColumnUInt8::create(dict_value_column_size, 0)), + std::make_shared( + std::make_shared()), + ""}); + } else { + temp_block.insert({std::move(dict_value_column), + std::make_shared(), ""}); + } + dict_pos = index; + } else { - temp_block.insert( - {std::move(dict_value_column), std::make_shared(), ""}); + temp_block.insert(ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(), + slot_desc->get_data_type_ptr(), + slot_desc->col_name())); } - dict_pos = index; + ++index; + } + // 2.2 Execute conjuncts. + VExprContextSPtrs ctxs; + auto iter = _slot_id_to_filter_conjuncts->find(slot_id); + if (iter != _slot_id_to_filter_conjuncts->end()) { + for (auto& ctx : iter->second) { + ctxs.push_back(ctx); + } } else { - temp_block.insert(ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(), - slot_desc->get_data_type_ptr(), - slot_desc->col_name())); + std::stringstream msg; + msg << "_slot_id_to_filter_conjuncts: slot_id [" << slot_id << "] not found"; + return Status::NotFound(msg.str()); + } + + if (dict_pos != 0) { + // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 + // The following process may be tricky and time-consuming, but we have no other way. + temp_block.get_by_position(0).column->assume_mutable()->resize( + dict_value_column_size); + } + IColumn::Filter result_filter(temp_block.rows(), 1); + bool can_filter_all; + { + RETURN_IF_ERROR(VExprContext::execute_conjuncts(ctxs, nullptr, &temp_block, + &result_filter, &can_filter_all)); + } + if (dict_pos != 0) { + // We have to clean the first column to insert right data. + temp_block.get_by_position(0).column->assume_mutable()->clear(); + } + + // If can_filter_all = true, can filter this row group. + if (can_filter_all) { + _is_row_group_filtered = true; + return Status::OK(); } - ++index; + + // 3. Get dict codes. 
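+ // result_filter is indexed by dictionary position, so each surviving index is
+ // exactly a dict code whose decoded value passed the conjuncts.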
+ std::vector dict_codes; + for (size_t i = 0; i < result_filter.size(); ++i) { + if (result_filter[i]) { + dict_codes.emplace_back(i); + } + } + + // About Performance: if dict_column size is too large, it will generate a large IN filter. + if (dict_codes.size() > MAX_DICT_CODE_PREDICATE_TO_REWRITE) { + it = _dict_filter_cols.erase(it); + for (auto& ctx : ctxs) { + _filter_conjuncts.push_back(ctx); + } + continue; + } + + // 4. Rewrite conjuncts. + RETURN_IF_ERROR(_rewrite_dict_conjuncts( + dict_codes, slot_id, + temp_block.get_by_position(dict_pos).column->is_nullable())); + ++it; } + return Status::OK(); + } - // 2.2 Execute conjuncts. - VExprContextSPtrs ctxs; - auto iter = _slot_id_to_filter_conjuncts->find(slot_id); - if (iter != _slot_id_to_filter_conjuncts->end()) { - for (auto& ctx : iter->second) { - ctxs.push_back(ctx); + Status RowGroupReader::_rewrite_dict_conjuncts(std::vector & dict_codes, int slot_id, + bool is_nullable) { + VExprSPtr root; + if (dict_codes.size() == 1) { + { + TFunction fn; + TFunctionName fn_name; + fn_name.__set_db_name(""); + fn_name.__set_function_name("eq"); + fn.__set_name(fn_name); + fn.__set_binary_type(TFunctionBinaryType::BUILTIN); + std::vector arg_types; + arg_types.push_back(create_type_desc(PrimitiveType::TYPE_INT)); + arg_types.push_back(create_type_desc(PrimitiveType::TYPE_INT)); + fn.__set_arg_types(arg_types); + fn.__set_ret_type(create_type_desc(PrimitiveType::TYPE_BOOLEAN)); + fn.__set_has_var_args(false); + + TExprNode texpr_node; + texpr_node.__set_type(create_type_desc(PrimitiveType::TYPE_BOOLEAN)); + texpr_node.__set_node_type(TExprNodeType::BINARY_PRED); + texpr_node.__set_opcode(TExprOpcode::EQ); + texpr_node.__set_fn(fn); + texpr_node.__set_num_children(2); + texpr_node.__set_is_nullable(is_nullable); + root = VectorizedFnCall::create_shared(texpr_node); + } + { + SlotDescriptor* slot = nullptr; + const std::vector& slots = _tuple_descriptor->slots(); + for (auto each : slots) { + if (each->id() == slot_id) { + slot = each; + break; + } + } + root->add_child(VSlotRef::create_shared(slot)); + } + { + TExprNode texpr_node; + texpr_node.__set_node_type(TExprNodeType::INT_LITERAL); + texpr_node.__set_type(create_type_desc(TYPE_INT)); + TIntLiteral int_literal; + int_literal.__set_value(dict_codes[0]); + texpr_node.__set_int_literal(int_literal); + texpr_node.__set_is_nullable(is_nullable); + root->add_child(VLiteral::create_shared(texpr_node)); } } else { - std::stringstream msg; - msg << "_slot_id_to_filter_conjuncts: slot_id [" << slot_id << "] not found"; - return Status::NotFound(msg.str()); + { + TTypeDesc type_desc = create_type_desc(PrimitiveType::TYPE_BOOLEAN); + TExprNode node; + node.__set_type(type_desc); + node.__set_node_type(TExprNodeType::IN_PRED); + node.in_predicate.__set_is_not_in(false); + node.__set_opcode(TExprOpcode::FILTER_IN); + // VdirectInPredicate assume is_nullable = false. 
+ node.__set_is_nullable(false); + + std::shared_ptr hybrid_set( + create_set(PrimitiveType::TYPE_INT, dict_codes.size(), false)); + for (int j = 0; j < dict_codes.size(); ++j) { + hybrid_set->insert(&dict_codes[j]); + } + root = vectorized::VDirectInPredicate::create_shared(node, hybrid_set); + } + { + SlotDescriptor* slot = nullptr; + const std::vector& slots = _tuple_descriptor->slots(); + for (auto each : slots) { + if (each->id() == slot_id) { + slot = each; + break; + } + } + root->add_child(VSlotRef::create_shared(slot)); + } } + VExprContextSPtr rewritten_conjunct_ctx = VExprContext::create_shared(root); + RETURN_IF_ERROR(rewritten_conjunct_ctx->prepare(_state, *_row_descriptor)); + RETURN_IF_ERROR(rewritten_conjunct_ctx->open(_state)); + _dict_filter_conjuncts.push_back(rewritten_conjunct_ctx); + _filter_conjuncts.push_back(rewritten_conjunct_ctx); + return Status::OK(); + } - if (dict_pos != 0) { - // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 - // The following process may be tricky and time-consuming, but we have no other way. - temp_block.get_by_position(0).column->assume_mutable()->resize(dict_value_column_size); + void RowGroupReader::_convert_dict_cols_to_string_cols(Block * block) { + for (auto& dict_filter_cols : _dict_filter_cols) { + if (!_col_name_to_block_idx->contains(dict_filter_cols.first)) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Wrong read column '{}' in parquet file, block: {}", + dict_filter_cols.first, block->dump_structure()); + } + ColumnWithTypeAndName& column_with_type_and_name = + block->get_by_position((*_col_name_to_block_idx)[dict_filter_cols.first]); + const ColumnPtr& column = column_with_type_and_name.column; + if (const auto* nullable_column = check_and_get_column(*column)) { + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const auto* dict_column = assert_cast(nested_column.get()); + DCHECK(dict_column); + + MutableColumnPtr string_column = + _column_readers[dict_filter_cols.first] + ->convert_dict_column_to_string_column(dict_column); + + column_with_type_and_name.type = + std::make_shared(std::make_shared()); + block->replace_by_position( + (*_col_name_to_block_idx)[dict_filter_cols.first], + ColumnNullable::create(std::move(string_column), + nullable_column->get_null_map_column_ptr())); + } else { + const auto* dict_column = assert_cast(column.get()); + MutableColumnPtr string_column = + _column_readers[dict_filter_cols.first] + ->convert_dict_column_to_string_column(dict_column); + + column_with_type_and_name.type = std::make_shared(); + block->replace_by_position((*_col_name_to_block_idx)[dict_filter_cols.first], + std::move(string_column)); + } } - IColumn::Filter result_filter(temp_block.rows(), 1); - bool can_filter_all; - { - RETURN_IF_ERROR(VExprContext::execute_conjuncts(ctxs, nullptr, &temp_block, - &result_filter, &can_filter_all)); + } + + void RowGroupReader::_convert_lazy_dict_cols_to_string_cols(Block * block) { + for (auto& lazy_dict_col : _lazy_dict_decode_cols) { + if (!_col_name_to_block_idx->contains(lazy_dict_col.first)) { + // Column may not be present if block was cleared (filter_all path). + continue; + } + ColumnWithTypeAndName& column_with_type_and_name = + block->get_by_position((*_col_name_to_block_idx)[lazy_dict_col.first]); + const ColumnPtr& column = column_with_type_and_name.column; + // If column is empty (e.g., cleared during filter_all), skip conversion. 
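+ // Even when the column is empty we still swap in an (empty) string column and the
+ // matching string type, so downstream consumers see the slot's declared type.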
+ if (column->size() == 0) { + // Still need to restore the type to string for consistency. + if (column_with_type_and_name.type->is_nullable()) { + column_with_type_and_name.type = + std::make_shared(std::make_shared()); + block->replace_by_position( + (*_col_name_to_block_idx)[lazy_dict_col.first], + ColumnNullable::create(ColumnString::create(), ColumnUInt8::create())); + } else { + column_with_type_and_name.type = std::make_shared(); + block->replace_by_position((*_col_name_to_block_idx)[lazy_dict_col.first], + ColumnString::create()); + } + continue; + } + if (const auto* nullable_column = check_and_get_column(*column)) { + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const auto* dict_column = assert_cast(nested_column.get()); + DCHECK(dict_column); + + MutableColumnPtr string_column = + _column_readers[lazy_dict_col.first]->convert_dict_column_to_string_column( + dict_column); + + column_with_type_and_name.type = + std::make_shared(std::make_shared()); + block->replace_by_position( + (*_col_name_to_block_idx)[lazy_dict_col.first], + ColumnNullable::create(std::move(string_column), + nullable_column->get_null_map_column_ptr())); + } else { + const auto* dict_column = assert_cast(column.get()); + MutableColumnPtr string_column = + _column_readers[lazy_dict_col.first]->convert_dict_column_to_string_column( + dict_column); + + column_with_type_and_name.type = std::make_shared(); + block->replace_by_position((*_col_name_to_block_idx)[lazy_dict_col.first], + std::move(string_column)); + } + } + } + + void RowGroupReader::_collect_slot_ids_from_expr(const VExpr* expr, std::set& slot_ids) { + if (expr->is_slot_ref()) { + const auto* slot_ref = static_cast(expr); + slot_ids.insert(slot_ref->slot_id()); } - if (dict_pos != 0) { - // We have to clean the first column to insert right data. - temp_block.get_by_position(0).column->assume_mutable()->clear(); + for (auto& child : expr->children()) { + _collect_slot_ids_from_expr(child.get(), slot_ids); } + } - // If can_filter_all = true, can filter this row group. - if (can_filter_all) { - _is_row_group_filtered = true; - return Status::OK(); + Status RowGroupReader::_do_lazy_read_per_column(Block * block, size_t batch_size, + size_t * read_rows, bool* batch_eof) { + // This method implements per-column predicate reading with intermediate filtering. + // Instead of reading all predicate columns at once, it reads them one by one, + // evaluating per-column conjuncts after each column. This allows highly-selective + // columns to reduce the number of rows decoded for subsequent columns. + // + // The overall structure mirrors _do_lazy_read(), but Phase 1 is changed from + // "read all predicate columns" to "read one column at a time + intermediate filter". + + std::unique_ptr filter_map_ptr = nullptr; + size_t pre_read_rows; + bool pre_eof; + std::vector columns_to_filter; + uint32_t origin_column_num = block->columns(); + columns_to_filter.resize(origin_column_num); + for (uint32_t i = 0; i < origin_column_num; ++i) { + columns_to_filter[i] = i; } + IColumn::Filter result_filter; + size_t pre_raw_read_rows = 0; + + const auto& pred_col_names = _lazy_read_ctx.predicate_columns.first; + + while (!_state->is_cancelled()) { + pre_read_rows = 0; + pre_eof = false; + + // Phase 1: Read predicate columns one by one with intermediate filtering. + // Get the column read order from the adaptive context. 
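+ // The first column in read_order is decoded with an empty intermediate_filter_map
+ // (nothing to skip yet); each later column receives the filter accumulated so far,
+ // so its decoder only materializes surviving rows (P0-1 pushdown).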
+ const auto& read_order = _column_read_order_ctx->get_column_read_order(); + size_t round_cost = 0; + double first_selectivity = -1; + + // We accumulate a combined filter across all predicate columns. + IColumn::Filter combined_filter; + bool has_combined_filter = false; + bool can_filter_all = false; + + // We need to read columns with filter_map from previously-evaluated predicates. + // For the first column, there's no filter. For subsequent columns, we pass the + // accumulated filter_map so filtered rows can be skipped at the decoder level. + FilterMap intermediate_filter_map; + + for (size_t round = 0; round < read_order.size(); ++round) { + size_t col_idx = read_order[round]; + const std::string& col_name = pred_col_names[col_idx]; + + round_cost += _column_read_order_ctx->get_column_cost(col_idx); + + // Read this single predicate column. + std::vector single_col = {col_name}; + size_t col_read_rows = 0; + bool col_eof = false; + RETURN_IF_ERROR(_read_column_data(block, single_col, batch_size, &col_read_rows, + &col_eof, intermediate_filter_map)); + + if (round == 0) { + pre_read_rows = col_read_rows; + pre_eof = col_eof; + } + + // Evaluate per-column conjuncts if this column has any. + auto conj_it = _per_col_conjuncts.find(col_idx); + if (conj_it != _per_col_conjuncts.end() && !conj_it->second.empty()) { + // Need to fill partition/missing columns that this conjunct may reference + // before evaluating. (Partition/missing conjuncts are handled separately.) + bool resize_first_column = _lazy_read_ctx.resize_first_column; + if (resize_first_column && _iceberg_rowid_params.enabled) { + int row_id_idx = + block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); + if (row_id_idx == 0) { + resize_first_column = false; + } + } + if (resize_first_column) { + block->get_by_position(0).column->assume_mutable()->resize(pre_read_rows); + } + + IColumn::Filter col_filter(pre_read_rows, static_cast(1)); + bool col_can_filter_all = false; + + // Apply existing combined_filter as a pre-filter + std::vector filters; + if (has_combined_filter) { + filters.push_back(&combined_filter); + } + + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + RETURN_IF_ERROR(VExprContext::execute_conjuncts(conj_it->second, &filters, + block, &col_filter, + &col_can_filter_all)); + } + + if (resize_first_column) { + block->get_by_position(0).column->assume_mutable()->clear(); + } - // 3. Get dict codes. - std::vector dict_codes; - for (size_t i = 0; i < result_filter.size(); ++i) { - if (result_filter[i]) { - dict_codes.emplace_back(i); + if (col_can_filter_all) { + can_filter_all = true; + if (first_selectivity < 0) { + first_selectivity = 0; + } + break; + } + + // Merge col_filter into combined_filter + if (!has_combined_filter) { + combined_filter = std::move(col_filter); + has_combined_filter = true; + } else { + for (size_t i = 0; i < pre_read_rows; ++i) { + combined_filter[i] &= col_filter[i]; + } + } + + if (first_selectivity < 0 && has_combined_filter) { + size_t hit = 0; + for (size_t i = 0; i < pre_read_rows; ++i) { + hit += combined_filter[i]; + } + first_selectivity = + static_cast(hit) / static_cast(pre_read_rows); + } + + // Update intermediate_filter_map for subsequent columns. + // This lets the next column's reader skip filtered rows at decode level. 
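+ // Only worth building when another predicate column remains to be read; if this
+ // was the last column, Phase 2 constructs its own FilterMap from result_filter.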
+ if (has_combined_filter && round + 1 < read_order.size()) { + // Check if all rows are filtered + bool all_filtered = true; + for (size_t i = 0; i < pre_read_rows; ++i) { + if (combined_filter[i]) { + all_filtered = false; + break; + } + } + if (all_filtered) { + can_filter_all = true; + break; + } + RETURN_IF_ERROR(intermediate_filter_map.init(combined_filter.data(), + pre_read_rows, false)); + } + } } - } - // About Performance: if dict_column size is too large, it will generate a large IN filter. - if (dict_codes.size() > MAX_DICT_CODE_PREDICATE_TO_REWRITE) { - it = _dict_filter_cols.erase(it); - for (auto& ctx : ctxs) { - _filter_conjuncts.push_back(ctx); + _column_read_order_ctx->update(round_cost, + first_selectivity >= 0 ? first_selectivity : 1); + + if (pre_read_rows == 0) { + DCHECK_EQ(pre_eof, true); + break; } - continue; - } + pre_raw_read_rows += pre_read_rows; - // 4. Rewrite conjuncts. - RETURN_IF_ERROR(_rewrite_dict_conjuncts( - dict_codes, slot_id, temp_block.get_by_position(dict_pos).column->is_nullable())); - ++it; - } - return Status::OK(); -} + // Fill partition and missing columns for predicate evaluation + RETURN_IF_ERROR(_fill_partition_columns(block, pre_read_rows, + _lazy_read_ctx.predicate_partition_columns)); + RETURN_IF_ERROR(_fill_missing_columns(block, pre_read_rows, + _lazy_read_ctx.predicate_missing_columns)); + RETURN_IF_ERROR(_fill_row_id_columns(block, pre_read_rows, false)); + RETURN_IF_ERROR(_append_iceberg_rowid_column(block, pre_read_rows, false)); -Status RowGroupReader::_rewrite_dict_conjuncts(std::vector& dict_codes, int slot_id, - bool is_nullable) { - VExprSPtr root; - if (dict_codes.size() == 1) { - { - TFunction fn; - TFunctionName fn_name; - fn_name.__set_db_name(""); - fn_name.__set_function_name("eq"); - fn.__set_name(fn_name); - fn.__set_binary_type(TFunctionBinaryType::BUILTIN); - std::vector arg_types; - arg_types.push_back(create_type_desc(PrimitiveType::TYPE_INT)); - arg_types.push_back(create_type_desc(PrimitiveType::TYPE_INT)); - fn.__set_arg_types(arg_types); - fn.__set_ret_type(create_type_desc(PrimitiveType::TYPE_BOOLEAN)); - fn.__set_has_var_args(false); - - TExprNode texpr_node; - texpr_node.__set_type(create_type_desc(PrimitiveType::TYPE_BOOLEAN)); - texpr_node.__set_node_type(TExprNodeType::BINARY_PRED); - texpr_node.__set_opcode(TExprOpcode::EQ); - texpr_node.__set_fn(fn); - texpr_node.__set_num_children(2); - texpr_node.__set_is_nullable(is_nullable); - root = VectorizedFnCall::create_shared(texpr_node); - } - { - SlotDescriptor* slot = nullptr; - const std::vector& slots = _tuple_descriptor->slots(); - for (auto each : slots) { - if (each->id() == slot_id) { - slot = each; - break; + RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows)); + + // Now evaluate multi-column conjuncts and position delete filter. 
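+ // Unless the per-column phase already filtered everything, result_filter is seeded
+ // from the combined per-column filter (or all ones when no single-column conjunct
+ // produced one) and then narrowed by the multi-column conjuncts and, when present,
+ // the position-delete filter.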
+ { + SCOPED_RAW_TIMER(&_predicate_filter_time); + + bool resize_first_column = _lazy_read_ctx.resize_first_column; + if (resize_first_column && _iceberg_rowid_params.enabled) { + int row_id_idx = + block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); + if (row_id_idx == 0) { + resize_first_column = false; + } + } + + if (!can_filter_all) { + // Initialize result_filter from combined_filter or fresh + if (has_combined_filter) { + result_filter = std::move(combined_filter); + } else { + result_filter.assign(pre_read_rows, static_cast(1)); + } + + // Evaluate multi-column conjuncts + if (!_multi_col_conjuncts.empty()) { + if (resize_first_column) { + block->get_by_position(0).column->assume_mutable()->resize( + pre_read_rows); + } + + std::vector filters; + if (_position_delete_ctx.has_filter) { + filters.push_back(_pos_delete_filter_ptr.get()); + } + + bool multi_can_filter_all = false; + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + RETURN_IF_ERROR(VExprContext::execute_conjuncts( + _multi_col_conjuncts, &filters, block, &result_filter, + &multi_can_filter_all)); + } + + if (resize_first_column) { + block->get_by_position(0).column->assume_mutable()->clear(); + } + + if (multi_can_filter_all) { + can_filter_all = true; + } + } else if (_position_delete_ctx.has_filter) { + // Apply position delete filter to result_filter + const auto* pos_filter = _pos_delete_filter_ptr->data(); + for (size_t i = 0; i < pre_read_rows; ++i) { + result_filter[i] &= pos_filter[i]; + } + // Check if all filtered + bool all_zero = true; + for (size_t i = 0; i < pre_read_rows; ++i) { + if (result_filter[i]) { + all_zero = false; + break; + } + } + if (all_zero) { + can_filter_all = true; + } + } + } else { + result_filter.assign(pre_read_rows, static_cast(0)); } } - root->add_child(VSlotRef::create_shared(slot)); + + const uint8_t* __restrict filter_map_data = result_filter.data(); + filter_map_ptr = std::make_unique(); + RETURN_IF_ERROR(filter_map_ptr->init(filter_map_data, pre_read_rows, can_filter_all)); + if (filter_map_ptr->filter_all()) { + { + SCOPED_RAW_TIMER(&_predicate_filter_time); + for (const auto& col : _lazy_read_ctx.predicate_columns.first) { + block->get_by_position((*_col_name_to_block_idx)[col]) + .column->assume_mutable() + ->clear(); + } + for (const auto& col : _lazy_read_ctx.predicate_partition_columns) { + block->get_by_position((*_col_name_to_block_idx)[col.first]) + .column->assume_mutable() + ->clear(); + } + for (const auto& col : _lazy_read_ctx.predicate_missing_columns) { + block->get_by_position((*_col_name_to_block_idx)[col.first]) + .column->assume_mutable() + ->clear(); + } + if (_row_id_column_iterator_pair.first != nullptr) { + block->get_by_position(_row_id_column_iterator_pair.second) + .column->assume_mutable() + ->clear(); + } + if (_iceberg_rowid_params.enabled) { + int row_id_idx = + block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); + if (row_id_idx >= 0) { + block->get_by_position(static_cast(row_id_idx)) + .column->assume_mutable() + ->clear(); + } + } + Block::erase_useless_column(block, origin_column_num); + } + + if (!pre_eof) { + _cached_filtered_rows += pre_read_rows; + if (pre_raw_read_rows >= config::doris_scanner_row_num) { + *read_rows = 0; + _convert_dict_cols_to_string_cols(block); + return Status::OK(); + } + } else { + *read_rows = 0; + *batch_eof = true; + _lazy_read_filtered_rows += (pre_read_rows + _cached_filtered_rows); + _convert_dict_cols_to_string_cols(block); + return Status::OK(); + } + } else { + break; + } } - { - 
TExprNode texpr_node; - texpr_node.__set_node_type(TExprNodeType::INT_LITERAL); - texpr_node.__set_type(create_type_desc(TYPE_INT)); - TIntLiteral int_literal; - int_literal.__set_value(dict_codes[0]); - texpr_node.__set_int_literal(int_literal); - texpr_node.__set_is_nullable(is_nullable); - root->add_child(VLiteral::create_shared(texpr_node)); + if (_state->is_cancelled()) { + return Status::Cancelled("cancelled"); } - } else { - { - TTypeDesc type_desc = create_type_desc(PrimitiveType::TYPE_BOOLEAN); - TExprNode node; - node.__set_type(type_desc); - node.__set_node_type(TExprNodeType::IN_PRED); - node.in_predicate.__set_is_not_in(false); - node.__set_opcode(TExprOpcode::FILTER_IN); - // VdirectInPredicate assume is_nullable = false. - node.__set_is_nullable(false); - - std::shared_ptr hybrid_set( - create_set(PrimitiveType::TYPE_INT, dict_codes.size(), false)); - for (int j = 0; j < dict_codes.size(); ++j) { - hybrid_set->insert(&dict_codes[j]); - } - root = vectorized::VDirectInPredicate::create_shared(node, hybrid_set); + + if (filter_map_ptr == nullptr) { + DCHECK_EQ(pre_read_rows + _cached_filtered_rows, 0); + *read_rows = 0; + *batch_eof = true; + return Status::OK(); + } + + FilterMap& filter_map = *filter_map_ptr; + DorisUniqueBufferPtr rebuild_filter_map = nullptr; + if (_cached_filtered_rows != 0) { + RETURN_IF_ERROR(_rebuild_filter_map(filter_map, rebuild_filter_map, pre_read_rows)); + pre_read_rows += _cached_filtered_rows; + _cached_filtered_rows = 0; } + + // Phase 2: Read lazy columns (same as original _do_lazy_read) + size_t lazy_read_rows; + bool lazy_eof; + RETURN_IF_ERROR(_read_column_data(block, _lazy_read_ctx.lazy_read_columns, pre_read_rows, + &lazy_read_rows, &lazy_eof, filter_map)); + + if (pre_read_rows != lazy_read_rows) { + return Status::Corruption("Can't read the same number of rows when doing lazy read"); + } + + // Filter data in predicate columns and remove filter column { - SlotDescriptor* slot = nullptr; - const std::vector& slots = _tuple_descriptor->slots(); - for (auto each : slots) { - if (each->id() == slot_id) { - slot = each; - break; + SCOPED_RAW_TIMER(&_predicate_filter_time); + if (filter_map.has_filter()) { + std::vector predicate_columns = _lazy_read_ctx.all_predicate_col_ids; + if (_iceberg_rowid_params.enabled) { + int row_id_idx = + block->get_position_by_name(doris::BeConsts::ICEBERG_ROWID_COL); + if (row_id_idx >= 0 && + std::find(predicate_columns.begin(), predicate_columns.end(), + static_cast(row_id_idx)) == predicate_columns.end()) { + predicate_columns.push_back(static_cast(row_id_idx)); + } } + RETURN_IF_CATCH_EXCEPTION( + Block::filter_block_internal(block, predicate_columns, result_filter)); + Block::erase_useless_column(block, origin_column_num); + } else { + Block::erase_useless_column(block, origin_column_num); } - root->add_child(VSlotRef::create_shared(slot)); } - } - VExprContextSPtr rewritten_conjunct_ctx = VExprContext::create_shared(root); - RETURN_IF_ERROR(rewritten_conjunct_ctx->prepare(_state, *_row_descriptor)); - RETURN_IF_ERROR(rewritten_conjunct_ctx->open(_state)); - _dict_filter_conjuncts.push_back(rewritten_conjunct_ctx); - _filter_conjuncts.push_back(rewritten_conjunct_ctx); - return Status::OK(); -} -void RowGroupReader::_convert_dict_cols_to_string_cols(Block* block) { - for (auto& dict_filter_cols : _dict_filter_cols) { - if (!_col_name_to_block_idx->contains(dict_filter_cols.first)) { - throw Exception(ErrorCode::INTERNAL_ERROR, - "Wrong read column '{}' in parquet file, block: {}", - 
dict_filter_cols.first, block->dump_structure()); - } - ColumnWithTypeAndName& column_with_type_and_name = - block->get_by_position((*_col_name_to_block_idx)[dict_filter_cols.first]); - const ColumnPtr& column = column_with_type_and_name.column; - if (const auto* nullable_column = check_and_get_column(*column)) { - const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); - const auto* dict_column = assert_cast(nested_column.get()); - DCHECK(dict_column); - - MutableColumnPtr string_column = - _column_readers[dict_filter_cols.first]->convert_dict_column_to_string_column( - dict_column); - - column_with_type_and_name.type = - std::make_shared(std::make_shared()); - block->replace_by_position( - (*_col_name_to_block_idx)[dict_filter_cols.first], - ColumnNullable::create(std::move(string_column), - nullable_column->get_null_map_column_ptr())); - } else { - const auto* dict_column = assert_cast(column.get()); - MutableColumnPtr string_column = - _column_readers[dict_filter_cols.first]->convert_dict_column_to_string_column( - dict_column); - - column_with_type_and_name.type = std::make_shared(); - block->replace_by_position((*_col_name_to_block_idx)[dict_filter_cols.first], - std::move(string_column)); + _convert_dict_cols_to_string_cols(block); + _convert_lazy_dict_cols_to_string_cols(block); + + size_t column_num = block->columns(); + size_t column_size = 0; + for (int i = 0; i < column_num; ++i) { + size_t cz = block->get_by_position(i).column->size(); + if (column_size != 0 && cz != 0) { + DCHECK_EQ(column_size, cz); + } + if (cz != 0) { + column_size = cz; + } } + _lazy_read_filtered_rows += pre_read_rows - column_size; + *read_rows = column_size; + + *batch_eof = pre_eof; + RETURN_IF_ERROR( + _fill_partition_columns(block, column_size, _lazy_read_ctx.partition_columns)); + RETURN_IF_ERROR(_fill_missing_columns(block, column_size, _lazy_read_ctx.missing_columns)); + return Status::OK(); } -} -ParquetColumnReader::ColumnStatistics RowGroupReader::merged_column_statistics() { - ParquetColumnReader::ColumnStatistics st; - for (auto& reader : _column_readers) { - auto ost = reader.second->column_statistics(); - st.merge(ost); + ParquetColumnReader::ColumnStatistics RowGroupReader::merged_column_statistics() { + ParquetColumnReader::ColumnStatistics st; + for (auto& reader : _column_readers) { + auto ost = reader.second->column_statistics(); + st.merge(ost); + } + return st; } - return st; -} #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h index 0cf2b36eb1b6bd..01784d746cec19 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include "olap/id_manager.h" #include "olap/utils.h" #include "vec/columns/column.h" +#include "vec/exec/format/parquet/column_read_order_ctx.h" #include "vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/table/table_format_reader.h" #include "vec/exprs/vexpr_fwd.h" @@ -116,6 +118,10 @@ class RowGroupReader : public ProfileCollector { std::unordered_map missing_columns; // should turn off filtering by page index, lazy read and dict filter if having complex type bool has_complex_type = false; + // P0-3: Candidate lazy string columns for deferred dict decode. 
+ // Populated in set_fill_columns(): lazy read columns whose slot type is string/BYTE_ARRAY. + // std::pair + std::vector> lazy_dict_decode_candidates; }; /** @@ -214,6 +220,9 @@ class RowGroupReader : public ProfileCollector { FilterMap& filter_map); Status _do_lazy_read(Block* block, size_t batch_size, size_t* read_rows, bool* batch_eof); + // Per-column lazy read: reads predicate columns one by one with intermediate filtering. + Status _do_lazy_read_per_column(Block* block, size_t batch_size, size_t* read_rows, + bool* batch_eof); Status _rebuild_filter_map(FilterMap& filter_map, DorisUniqueBufferPtr& filter_map_data, size_t pre_read_rows) const; @@ -235,6 +244,12 @@ class RowGroupReader : public ProfileCollector { Status _rewrite_dict_predicates(); Status _rewrite_dict_conjuncts(std::vector& dict_codes, int slot_id, bool is_nullable); void _convert_dict_cols_to_string_cols(Block* block); + // P0-3: Convert lazy dict decode columns (ColumnInt32) back to string columns. + // Called after filtering so only surviving rows are materialized. + void _convert_lazy_dict_cols_to_string_cols(Block* block); + + // Recursively collects all slot IDs referenced by an expression tree. + static void _collect_slot_ids_from_expr(const VExpr* expr, std::set& slot_ids); Status _get_current_batch_row_id(size_t read_rows); Status _fill_row_id_columns(Block* block, size_t read_rows, bool is_current_row_ids); @@ -269,6 +284,10 @@ class RowGroupReader : public ProfileCollector { VExprContextSPtrs _filter_conjuncts; // std::pair std::vector> _dict_filter_cols; + // P0-3: Lazy string columns confirmed as fully dict-encoded. These will output + // int32 dict codes during Phase 2 read, then be converted back to strings after filtering. + // std::pair + std::vector> _lazy_dict_decode_cols; RuntimeState* _state = nullptr; std::shared_ptr _obj_pool; const std::set& _column_ids; @@ -281,6 +300,17 @@ class RowGroupReader : public ProfileCollector { std::vector _current_batch_row_ids; std::unordered_map* _col_name_to_block_idx = nullptr; + + // P0-2: Per-column predicate read order optimization + // Maps predicate column index (in predicate_columns arrays) to its single-slot conjuncts. + // Built from _slot_id_to_filter_conjuncts during init(). + std::unordered_map _per_col_conjuncts; + // Conjuncts that reference multiple slots or no specific slot (evaluated after all pred cols). + VExprContextSPtrs _multi_col_conjuncts; + // Adaptive column read order context. + std::unique_ptr _column_read_order_ctx; + // Whether per-column lazy read optimization is active for this row group. + bool _enable_per_column_lazy_read = false; }; #include "common/compile_check_end.h" diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp index 606ec6b123427c..cf714d10fba108 100644 --- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp @@ -26,6 +26,7 @@ #include #include +#include "common/config.h" #include "common/status.h" #include "exec/schema_scanner.h" #include "io/file_factory.h" @@ -560,6 +561,43 @@ Status ParquetReader::set_fill_columns( _lazy_read_ctx.all_predicate_col_ids.emplace_back(_row_id_column_iterator_pair.second); } + // P0-3: Identify candidate lazy string columns for deferred dict decode. + // A candidate is a lazy read column that is: string-typed slot, BYTE_ARRAY physical type, + // and not a complex type. Actual dict-encoding confirmation happens per row group in init(). 
+ if (config::enable_parquet_lazy_dict_decode_for_lazy_columns && + !_lazy_read_ctx.has_complex_type && _colname_to_slot_id != nullptr && + _tuple_descriptor != nullptr) { + for (const auto& lazy_col : _lazy_read_ctx.lazy_read_columns) { + auto slot_id_it = _colname_to_slot_id->find(lazy_col); + if (slot_id_it == _colname_to_slot_id->end()) { + continue; + } + int slot_id = slot_id_it->second; + // Find the SlotDescriptor to check slot type + SlotDescriptor* slot = nullptr; + for (auto* each : _tuple_descriptor->slots()) { + if (each->id() == slot_id) { + slot = each; + break; + } + } + if (slot == nullptr) { + continue; + } + if (!is_string_type(slot->type()->get_primitive_type()) && + !is_var_len_object(slot->type()->get_primitive_type())) { + continue; + } + // Check parquet physical type is BYTE_ARRAY + auto file_col_name = _table_info_node_ptr->children_file_column_name(lazy_col); + auto* field = schema.get_column(file_col_name); + if (field == nullptr || field->physical_type != tparquet::Type::BYTE_ARRAY) { + continue; + } + _lazy_read_ctx.lazy_dict_decode_candidates.emplace_back(lazy_col, slot_id); + } + } + for (auto& kv : _lazy_read_ctx.fill_partition_columns) { auto iter = predicate_columns.find(kv.first); if (iter == predicate_columns.end()) { diff --git a/docs/P0-1_Filter_Bitmap_Pushdown_Test_Report.md b/docs/P0-1_Filter_Bitmap_Pushdown_Test_Report.md new file mode 100644 index 00000000000000..dc87cef77e69c3 --- /dev/null +++ b/docs/P0-1_Filter_Bitmap_Pushdown_Test_Report.md @@ -0,0 +1,383 @@ +# P0-1 Filter Bitmap 下推到 Decoder 层 — 测试文档 + +## 1. 功能概述 + +本优化为 Doris Parquet Reader 的 P0-1 优化项:**Filter Bitmap 下推到 Decoder 层**,实现了懒惰字典索引解码(Lazy Dict Index Decoding)。 + +### 1.1 优化目标 + +在低选择率场景(存活行 < 5%)下,避免对所有非空行进行 RLE 字典索引解码,改为: +- **CONTENT 行**(存活行):按需解码 RLE 索引,再做字典查找 +- **FILTERED_CONTENT 行**(被过滤行):通过 `RleBatchDecoder::SkipBatch()` 直接跳过 RLE 数据流,不解码 + +### 1.2 核心对比 + +| | 原始路径(Eager) | 优化路径(Lazy) | +|---|---|---| +| 索引解码 | 一次性 `GetBatch` 解码全部非空索引 | 按 run 分段:CONTENT 用 `GetBatch`,FILTERED_CONTENT 用 `SkipBatch` | +| 字典查找 | CONTENT 做查找,FILTERED_CONTENT 跳过 index | CONTENT 做查找,FILTERED_CONTENT 不解码不查找 | +| 内存分配 | `_indexes.resize(non_null_size)` 全量 | `_indexes.resize(run_length)` 按需 | +| RLE 跳过方式 | 无 | `SkipBatch` 以 32 值为对齐单位的快速字节跳过 | + +--- + +## 2. 
修改文件清单 + +### 2.1 核心修改 + +| 文件 | 修改内容 | 重要程度 | +|------|----------|----------| +| `be/src/util/rle_encoding.h` | 新增 `RleBatchDecoder::SkipBatch()` 方法 | 高 | +| `be/src/vec/exec/format/parquet/decoder.h` | `Decoder::decode_values()` 增加 `filter_data` 参数;`BaseDictDecoder::skip_values()` 使用 SkipBatch | 高 | +| `be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp` | 新增 `_lazy_decode_fixed_values()` 懒惰解码路径 | 高 | +| `be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp` | 新增 `_lazy_decode_string_values()` 懒惰解码路径 | 高 | +| `be/src/vec/exec/format/parquet/byte_array_dict_decoder.h` | 新增 `_lazy_decode_string_values()` 声明 | 中 | +| `be/src/vec/exec/format/parquet/vparquet_column_reader.cpp` | 选择率计算 + `filter_data` 传递逻辑 | 高 | +| `be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h/.cpp` | `decode_values()` 增加 `filter_data` 参数并透传 | 中 | +| `be/src/common/config.h` / `config.cpp` | 新增配置项 `enable_parquet_lazy_dict_decode` | 中 | + +### 2.2 签名更新(仅参数变更,无功能改动) + +| 文件 | 修改内容 | +|------|----------| +| `be/src/vec/exec/format/parquet/fix_length_plain_decoder.h/.cpp` | `decode_values()` 增加 `filter_data` 默认参数 | +| `be/src/vec/exec/format/parquet/byte_array_plain_decoder.h/.cpp` | 同上 | +| `be/src/vec/exec/format/parquet/byte_stream_split_decoder.h/.cpp` | 同上 | +| `be/src/vec/exec/format/parquet/bool_plain_decoder.h/.cpp` | 同上 | +| `be/src/vec/exec/format/parquet/bool_rle_decoder.h/.cpp` | 同上 | +| `be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h` | 3 个内联 `decode_values()` 签名更新 | + +### 2.3 新增文件 + +| 文件 | 说明 | +|------|------| +| `be/benchmark/benchmark_parquet_dict_decoder.hpp` | 微基准测试:字典解码器 + RLE SkipBatch | + +--- + +## 3. 配置项 + +### 3.1 `enable_parquet_lazy_dict_decode` + +| 属性 | 值 | +|------|-----| +| 类型 | mBool(运行时可修改) | +| 默认值 | `true` | +| 作用 | 控制是否启用 Parquet 字典解码器的懒惰索引解码优化 | +| 关闭方式 | `curl http://be_host:webserver_port/api/update_config?enable_parquet_lazy_dict_decode=false` | + +### 3.2 触发条件 + +懒惰解码路径在同时满足以下条件时激活: +1. `enable_parquet_lazy_dict_decode = true`(配置开关打开) +2. `filter_map.has_filter() = true`(存在过滤条件) +3. `filter_map.filter_ratio() > 0.95`(超过 95% 的行被过滤,即存活率 < 5%) +4. 列不是 `ColumnDictionary` 类型,且不是 `is_dict_filter` 模式 + +代码位置:`vparquet_column_reader.cpp:398-407` + +```cpp +const uint8_t* filter_data = nullptr; +if (config::enable_parquet_lazy_dict_decode && filter_map.has_filter() && + filter_map.filter_ratio() > 0.95) { + filter_data = filter_map.filter_map_data(); +} +return _chunk_reader->decode_values(data_column, type, select_vector, is_dict_filter, + filter_data); +``` + +--- + +## 4. 技术实现细节 + +### 4.1 RleBatchDecoder::SkipBatch() 实现 + +**文件**: `be/src/util/rle_encoding.h:894-959` + +该方法在 RLE 编码数据流中跳过指定数量的值,不进行实际解码。处理三种情况: + +1. **Repeat Run(重复值)**:直接递减 `repeat_count_`,零成本跳过 +2. **Literal Run 已缓冲部分**:推进 `literal_buffer_pos_`,跳过已解码到缓冲区的值 +3. 
**Literal Run 未缓冲部分**: + - 以 32 值为对齐单位,调用 `bit_reader_.SkipBatch()` 进行字节级快速跳过 + - 不足 32 值的尾部,通过 `FillLiteralBuffer()` 解码到缓冲区后推进位置 + +**为何以 32 对齐**:`BatchedBitReader::SkipBatch()` 要求 `bit_width * num_values` 能被 8 整除。32 值 × 任意 bit_width 总能满足此约束(因为 RLE literal run 是 8 的倍数,32 是 8 的倍数)。非对齐跳过会导致字节位移错位,读取后续数据产生垃圾值。 + +### 4.2 懒惰解码路径 + +以 `FixLengthDictDecoder::_lazy_decode_fixed_values()` 为例(`fix_length_dict_decoder.hpp:194-242`): + +``` +Loop over ColumnSelectVector runs: + CONTENT: + _indexes.resize(run_length) + _index_batch_decoder->GetBatch(_indexes.data(), run_length) // 仅解码当前 run + for i in 0..run_length: + output[i] = _dict_items[_indexes[i]] // 字典查找 + FILTERED_CONTENT: + _index_batch_decoder->SkipBatch(run_length) // 直接跳过,不解码 + NULL_DATA: + data_index += run_length * _type_length // 填充默认值 + FILTERED_NULL: + // 什么都不做 +``` + +`ByteArrayDictDecoder::_lazy_decode_string_values()` 逻辑相同,区别仅在字典值类型为变长字符串。 + +### 4.3 调用链路 + +``` +ScalarColumnReader::_read_values(filter_map) + → 计算 filter_ratio,决定是否传递 filter_data + → ColumnChunkReader::decode_values(select_vector, is_dict_filter, filter_data) + → Decoder::decode_values(column, type, select_vector, is_dict_filter, filter_data) + → filter_data != nullptr 时进入懒惰解码路径 + → filter_data == nullptr 时走原始路径(全量解码后遍历 run) +``` + +--- + +## 5. 测试方案 + +### 5.1 微基准测试(已完成) + +#### 5.1.1 构建与运行 + +```bash +# 构建 +cd be/build_benchmark +ninja -j 10 benchmark_test + +# 运行全部 Parquet 相关 benchmark +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd) +./bin/benchmark_test --benchmark_filter="BM_ByteArray|BM_FixLen|BM_Rle" +``` + +#### 5.1.2 基准测试用例一览 + +| 测试名 | 参数 | 测试目标 | +|--------|------|----------| +| `BM_RleSkip_GetBatch` | 10K/100K/1M 值 | RLE 全量解码基线 | +| `BM_RleSkip_SkipBatch` | 10K/100K/1M 值 | RLE SkipBatch 性能 | +| `BM_ByteArrayDictDecode_NoFilter` | dict=100/10K/100K, sel=1-100% | 字符串字典解码原始路径 | +| `BM_ByteArrayDictDecode_WithFilter` | dict=100/10K/100K, sel=1-100% | 字符串字典解码懒惰路径 | +| `BM_FixLenDictDecode_NoFilter` | dict=100/1M, sel=5-50% | 定长字典解码原始路径 | +| `BM_FixLenDictDecode_WithFilter` | dict=100/1M, sel=5-50% | 定长字典解码懒惰路径 | + +#### 5.1.3 基准测试结果 + +**测试环境**:16 核 CPU,L1D 48KB×8, L2 1280KB×8, L3 49152KB×1 + +##### RLE SkipBatch vs GetBatch + +| 数据量 | GetBatch (µs) | SkipBatch (µs) | 加速比 | +|--------|---------------|----------------|--------| +| 10K | 3.94 | 0.59 | **6.7x** | +| 100K | 33.4 | 3.92 | **8.5x** | +| 1M | 341 | 38.2 | **8.9x** | + +**结论**:SkipBatch 相比 GetBatch 有 **6.7-8.9 倍**的性能提升,验证了 RLE 跳过的有效性。 + +##### ByteArray 字典解码(dict=100K,大字典) + +| 存活率 | NoFilter (µs) | WithFilter (µs) | 对比 | +|--------|---------------|-----------------|------| +| 1% | 239 | 603 | +152%(回退) | +| 5% | 498 | 620 | +24%(回退) | +| 20% | 1273 | 1656 | +30%(回退) | +| 50% | 3878 | 3373 | -13%(提升) | +| 100% | 2555 | 2736 | +7%(回退) | + +##### FixLen 字典解码(dict=1M,大字典) + +| 存活率 | NoFilter (µs) | WithFilter (µs) | 对比 | +|--------|---------------|-----------------|------| +| 5% | 707 | 629 | **-11%(提升)** | +| 20% | 880 | 1114 | +27%(回退) | +| 50% | 1370 | 2005 | +46%(回退) | + +##### 性能分析 + +1. **RLE SkipBatch 本身非常高效**,相比 GetBatch 有 6-9 倍提升。 +2. **FixLen 类型在低选择率时有明显收益**(dict=1M, sel=5% 时提升 11%)。 +3. **ByteArray 类型的懒惰路径存在额外开销**,原因是: + - 每个 CONTENT run 需要独立调用 `insert_many_strings_overflow`,而原始路径只在最外层按 run 调用 + - Per-run 的 `GetBatch` 调用开销累积大于一次性 `GetBatch` 的开销 +4. 
**因此生产环境触发阈值设为 filter_ratio > 0.95**(存活率 < 5%),仅在极端低选择率场景才启用,最小化回退风险。 + +### 5.2 功能正确性测试方案 + +#### 5.2.1 单元测试(建议补充) + +需要编写的单元测试覆盖以下场景: + +**RleBatchDecoder::SkipBatch 正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 跳过完整 repeat run | SkipBatch(N) 后 GetBatch 读到正确下一个值 | +| 跳过完整 literal run | SkipBatch(N) 后 GetBatch 读到正确值 | +| 跳过部分 repeat run | 跳过 run 的前半段,GetBatch 读后半段 | +| 跳过部分 literal run(< 32 值)| 触发 FillLiteralBuffer 的 buffer 路径 | +| 跳过部分 literal run(>= 32 值)| 触发 SkipBatch 的 32-对齐字节跳过路径 | +| 混合交替跳过和读取 | Skip(10) → Get(5) → Skip(20) → Get(10) → ... | +| 跳过全部值 | SkipBatch(total_count) 返回 total_count | +| 跳过超过剩余值的数量 | SkipBatch(total+100) 返回 total_count(不崩溃) | +| bit_width 边界值 | bit_width=1, 8, 16, 32 | + +**懒惰字典解码正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 全部 CONTENT(无过滤) | 结果与原始路径完全一致 | +| 全部 FILTERED_CONTENT | 列为空(无新增行) | +| 混合 CONTENT + FILTERED_CONTENT | 存活行的值正确,列大小正确 | +| 包含 NULL_DATA + FILTERED_NULL | null 值处理正确 | +| 极端低选择率(1 行存活/100K 行) | 该 1 行的值正确 | +| 大字典 + 大数据量 | 无越界访问、无垃圾值 | +| INT32/INT64/FLOAT/DOUBLE/FIXED_LEN_BYTE_ARRAY 各类型 | 类型兼容 | +| ByteArray 字典(变长字符串) | 字符串内容和长度正确 | + +#### 5.2.2 集成测试(建议执行) + +使用 Doris 的 regression test 框架,测试真实 Parquet 文件读取: + +```sql +-- 1. 基础查询:有过滤条件的 Parquet 表扫描 +SELECT * FROM parquet_table WHERE id = 12345; -- 极低选择率 + +-- 2. 聚合查询:低选择率 + 聚合 +SELECT count(*), sum(amount) FROM parquet_table WHERE status = 'RARE_VALUE'; + +-- 3. 字符串列:验证变长字符串字典解码 +SELECT name, address FROM parquet_table WHERE category = 'UNCOMMON'; + +-- 4. 多列联合过滤 +SELECT * FROM parquet_table WHERE col_a = 1 AND col_b = 'x'; + +-- 5. 无过滤条件:验证不触发懒惰路径时无回退 +SELECT count(*) FROM parquet_table; + +-- 6. 高选择率:验证不触发懒惰路径 +SELECT * FROM parquet_table WHERE id > 0; -- 几乎全部存活 + +-- 7. 配置开关关闭时应走原始路径 +-- SET enable_parquet_lazy_dict_decode = false; +-- 重复上述查询,验证结果一致 +``` + +**外表类型覆盖**: +- Hive 外表(Parquet 格式) +- Iceberg 外表(Parquet 格式) +- 直接 `SELECT * FROM S3()` 读取 Parquet 文件 + +### 5.3 回归测试方案 + +#### 5.3.1 配置开关对比测试 + +对同一查询分别执行: + +| 场景 | 配置 | 预期 | +|------|------|------| +| A | `enable_parquet_lazy_dict_decode = true` | 结果正确,低选择率时性能持平或提升 | +| B | `enable_parquet_lazy_dict_decode = false` | 结果正确,走原始路径 | + +验证:A 和 B 的查询结果完全一致(`diff` 比较)。 + +#### 5.3.2 边界条件测试 + +| 场景 | 描述 | +|------|------| +| 空 Page | 0 行数据的 Parquet page | +| 全 null 列 | 所有行都是 null | +| 全非 null 列 | 无 null 值 | +| 单行 Page | 每个 page 只有 1 行 | +| filter_ratio 恰好 0.95 | 边界不触发(需 > 0.95) | +| filter_ratio = 1.0(全过滤) | 全部 SkipBatch,列不增长 | +| filter_ratio = 0.0(全存活) | 不触发懒惰路径 | +| 跨 Page 读取 | 验证 Page 切换时 RLE decoder 重置正确 | + +--- + +## 6. 已知限制与风险 + +### 6.1 性能限制 + +- **ByteArray 类型在中等选择率(5-50%)时可能有回退**,因为 per-run `insert_many_strings_overflow` 调用频率增加。生产环境通过 `filter_ratio > 0.95` 阈值规避。 +- 当前懒惰路径不适用于 `ColumnDictionary`(Doris 内部字典列)和 `is_dict_filter` 模式,这些场景需要全量索引。 + +### 6.2 兼容性 + +- **无协议变更**:仅 BE 内部解码逻辑优化,不涉及存储格式、网络协议、FE 变更。 +- **向后兼容**:通过配置开关 `enable_parquet_lazy_dict_decode = false` 可完全关闭优化,回退到原始路径。 +- **所有非字典编码器**(PlainDecoder、BoolDecoder、DeltaBitPack 等)仅做签名更新,功能无变化。 + +### 6.3 潜在风险 + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| SkipBatch 字节对齐错误 | 后续 GetBatch 读取垃圾值导致崩溃 | 已通过 32 对齐 + FillLiteralBuffer 修复并通过 benchmark 验证 | +| 极端 bit_width 场景 | bit_width=0 或 bit_width=64 时的边界行为 | bit_width=0 表示字典仅一个值(全 repeat run),SkipBatch 只走 repeat 分支,安全 | +| filter_ratio 计算精度 | filter_ratio 是 double,阈值比较可能有浮点精度问题 | 使用 `> 0.95` 而非 `>= 0.95`,足够宽松 | + +--- + +## 7. Benchmark 复现指南 + +### 7.1 环境准备 + +```bash +# 1. 确保 benchmark 构建目录存在 +ls be/build_benchmark/build.ninja + +# 2. 如果不存在,运行完整构建脚本 +./run-be-benchmark.sh + +# 3. 
如果已存在,增量构建 +cd be/build_benchmark && ninja -j 10 benchmark_test +``` + +### 7.2 运行方式 + +```bash +# 设置 Java 环境(benchmark 二进制依赖 libjvm.so) +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd)/be/build_benchmark + +# 运行所有 Parquet 相关 benchmark +./be/build_benchmark/bin/benchmark_test --benchmark_filter="BM_ByteArray|BM_FixLen|BM_Rle" + +# 只运行 RLE SkipBatch 测试 +./be/build_benchmark/bin/benchmark_test --benchmark_filter="BM_Rle" + +# 运行特定字典大小的测试 +./be/build_benchmark/bin/benchmark_test --benchmark_filter="BM_ByteArrayDictDecode.*/100000" +``` + +### 7.3 Benchmark 文件说明 + +**文件**: `be/benchmark/benchmark_parquet_dict_decoder.hpp` + +辅助函数: +- `build_string_dict(dict_size, avg_str_len)` — 构建 ByteArray 字典数据 +- `build_int32_dict(dict_size)` — 构建 INT32 定长字典数据 +- `build_rle_dict_indexes(num_values, dict_size)` — 生成 RLE 编码的字典索引数据 +- `build_run_length_null_map(num_values)` — 构建无 null 的 run length null map +- `build_filter_bitmap(num_values, selectivity)` — 按给定选择率生成过滤位图 + +参数格式为 `(dict_size, selectivity_percent, num_values_in_thousands)`。 + +--- + +## 8. 总结 + +P0-1 优化通过在 Decoder 层实现懒惰字典索引解码,在极低选择率(< 5%)场景下避免了无效的 RLE 索引解码开销。核心贡献包括: + +1. **RleBatchDecoder::SkipBatch()** — 以 6-9 倍于 GetBatch 的速度跳过 RLE 编码数据 +2. **懒惰解码路径** — FixLengthDictDecoder 和 ByteArrayDictDecoder 均支持按 run 粒度的按需解码 +3. **生产安全** — 通过运行时可调配置 `enable_parquet_lazy_dict_decode` 和 `filter_ratio > 0.95` 阈值控制,最小化对现有查询的影响 +4. **完整调用链路** — 从 ScalarColumnReader 到 Decoder 的 filter_data 传递已打通 diff --git a/docs/P0-2_Column_Read_Order_Test_Report.md b/docs/P0-2_Column_Read_Order_Test_Report.md new file mode 100644 index 00000000000000..f0f25c5454e556 --- /dev/null +++ b/docs/P0-2_Column_Read_Order_Test_Report.md @@ -0,0 +1,453 @@ +# P0-2 谓词列读取顺序优化 — 测试文档 + +## 1. 功能概述 + +本优化为 Doris Parquet Reader 的 P0-2 优化项:**谓词列读取顺序优化(Predicate Column Read Order Optimization)**,实现了按列逐一读取 + 中间过滤 + 自适应列排序。 + +### 1.1 优化目标 + +在多谓词列的 lazy read 场景下,将原有"一次性读取所有谓词列再统一过滤"改为"逐列读取 + 每列读后立即评估过滤": +- 高选择性的列先读,快速过滤掉大量行 +- 后续列只需解码存活行(借助 P0-1 的 Filter Bitmap 下推) +- 通过自适应探索(ColumnReadOrderCtx)自动找到最优列顺序 + +### 1.2 核心对比 + +| | 原始路径(AllAtOnce) | 优化路径(PerColumn) | +|---|---|---| +| 读取方式 | 一次性读取全部谓词列 | 逐列读取,每列读后立即过滤 | +| 过滤时机 | 全部列读完后统一评估 `_filter_conjuncts` | 每列读后评估该列的 per-col conjuncts | +| 后续列解码量 | 全量(无中间过滤) | 仅存活行(通过 `intermediate_filter_map` 传递) | +| 列顺序 | 固定顺序 | 自适应排序(前10批探索,之后锁定最优) | +| 适用场景 | 通用 | 存在高选择性谓词列时收益显著 | + +--- + +## 2. 修改文件清单 + +### 2.1 核心修改 + +| 文件 | 修改内容 | 重要程度 | +|------|----------|----------| +| `be/src/vec/exec/format/parquet/vparquet_group_reader.h` | 新增 `_do_lazy_read_per_column()` 声明、`_collect_slot_ids_from_expr()` 声明、新成员变量 | 高 | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` | conjunct 分类逻辑、`_do_lazy_read_per_column()` 实现(~360行)、探索分发逻辑 | 高 | + +### 2.2 新增文件 + +| 文件 | 说明 | +|------|------| +| `be/src/vec/exec/format/parquet/column_read_order_ctx.h` | `ColumnReadOrderCtx` 类(~93行):自适应列排序管理 | +| `be/benchmark/benchmark_column_read_order.hpp` | 微基准测试:per-column 读取模拟 + filter 累积 + Ctx 开销 | + +### 2.3 配置项 + +| 文件 | 修改内容 | +|------|----------| +| `be/src/common/config.h` | 新增 `enable_parquet_per_column_lazy_read` 配置 | +| `be/src/common/config.cpp` | 对应定义 | + +--- + +## 3. 配置项 + +### 3.1 `enable_parquet_per_column_lazy_read` + +| 属性 | 值 | +|------|-----| +| 类型 | mBool(运行时可修改) | +| 默认值 | `true` | +| 作用 | 控制是否启用逐列谓词读取优化 | +| 关闭方式 | `curl http://be_host:webserver_port/api/update_config?enable_parquet_per_column_lazy_read=false` | + +### 3.2 触发条件 + +逐列读取路径在同时满足以下条件时激活: +1. 
`enable_parquet_per_column_lazy_read = true` +2. lazy read 模式已启用(存在谓词列和惰性列的分离) +3. 至少一个谓词列拥有独立的 per-column conjunct(单列谓词) +4. `ColumnReadOrderCtx` 被成功创建 + +--- + +## 4. 技术实现细节 + +### 4.1 Conjunct 分类 + +**位置**:`vparquet_group_reader.cpp::init()` (~line 257-322) + +遍历所有 `_filter_conjuncts`,通过 `_collect_slot_ids_from_expr()` 递归解析表达式树中引用的 slot ID: +- **单列 conjunct** → 存入 `_per_col_conjuncts[col_idx]` +- **多列 conjunct** → 存入 `_multi_col_conjuncts` + +### 4.2 ColumnReadOrderCtx 自适应排序 + +**文件**:`column_read_order_ctx.h` + +| 阶段 | 前10批(探索) | 第11批起(利用) | +|------|---------------|-----------------| +| 列顺序 | 随机洗牌 | 锁定历史最优顺序 | +| 代价追踪 | 记录每批的 round_cost + first_selectivity | 不再更新 | +| 最优标准 | round_cost 最小;相同时优先 first_selectivity 小的 | — | + +`round_cost` = Σ(该列读取时的存活行数 × 该列的 per-row cost) + +### 4.3 `_do_lazy_read_per_column()` 核心流程 + +``` +Phase 1 — 逐列读取谓词列: + for col in column_read_order: + read_column_data(col, intermediate_filter_map) // 借助 P0-1 跳过已过滤行 + evaluate per_col_conjuncts[col] → col_filter + combined_filter &= col_filter // 累积过滤 + update intermediate_filter_map // 传递给下一列 + + evaluate _multi_col_conjuncts → final_filter // 多列联合谓词 + combined_filter &= final_filter + + if filter_all → clear & retry (while loop) + +Phase 2 — 读取惰性列 + 最终过滤: + (与原始 _do_lazy_read() 的 Phase 2 完全相同) +``` + +### 4.4 调用链路 + +``` +RowGroupReader::_do_lazy_read() + → if (_enable_per_column_lazy_read) + → _do_lazy_read_per_column(block, columns, batch_size, read_rows, eof) + → ColumnReadOrderCtx::get_column_read_order() + → _read_column_data(block, single_col, batch_size, ..., &intermediate_filter_map) + → VExprContext::execute_conjuncts(per_col_conjuncts[col]) → col_filter + → combine_filters → update intermediate_filter_map + → VExprContext::execute_conjuncts(multi_col_conjuncts) + → ColumnReadOrderCtx::update(round_cost, first_selectivity) + → Phase 2: read lazy columns + final filter +``` + +--- + +## 5. 
测试方案 + +### 5.1 微基准测试(已完成) + +#### 5.1.1 构建与运行 + +```bash +# 增量构建 +cd be/build_benchmark && ninja -j 10 benchmark_test + +# 运行 P0-2 benchmark +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd)/be/build_benchmark +./bin/benchmark_test --benchmark_filter="BM_P02_" --benchmark_repetitions=3 --benchmark_report_aggregates_only=true +``` + +#### 5.1.2 基准测试用例一览 + +新版 benchmark 将 P0-1(Filter Bitmap Pushdown)和 P0-2(Column Read Order)的效果明确分离,设置三个对比组: + +| 组别 | 测试名 | P0-1 | P0-2 | 测试目标 | +|------|--------|------|------|----------| +| 基线 | `BM_P02_AllAtOnce` | 否 | 否 | 原始路径:全部列解码全部行,再统一过滤 | +| P0-2 only | `BM_P02_PerCol_NoPushdown_Best/Worst` | 否 | 是 | 逐列读取 + 中间过滤,但 decoder 仍解码全部行 | +| P0-2 + P0-1 | `BM_P02_PerCol_WithPushdown_Best/Worst` | 是 | 是 | 逐列读取 + 中间过滤 + decoder 仅解码存活行 | +| 自适应 | `BM_P02_PerCol_Adaptive` | 是 | 是 | 使用 ColumnReadOrderCtx 自适应排序(20 batches: 10 探索 + 10 利用) | +| 辅助 | `BM_P02_FilterAccumulation` | — | — | 纯 filter 累积(bitwise AND)开销 | +| 辅助 | `BM_P02_CtxOverhead` | — | — | ColumnReadOrderCtx 自身管理开销 | + +参数格式:`(num_cols, num_rows_in_thousands, scenario)`,其中 scenario: 0=skewed, 1=uniform, 2=cascading。 + +**关键区别:两种 decode 模拟函数** + +| 函数 | 模拟行为 | 对应场景 | +|------|----------|----------| +| `p02_decode_no_pushdown(num_rows, cost, scratch)` | `memset(scratch, 0x42, num_rows * cost)` — 全量解码 | AllAtOnce / NoPushdown | +| `p02_decode_with_pushdown(filter, num_rows, cost, scratch)` | 逐行检查 `filter[i]`,仅解码存活行 | WithPushdown / Adaptive | + +这一设计确保 NoPushdown 组的解码开销与 AllAtOnce 完全一致,隔离了 P0-2 单独的效果。 + +**模拟场景说明**: + +| 场景 | 描述 | 实际业务映射 | +|------|------|-------------| +| **skewed** | 1列=1%选择率,其余=90% | 主键过滤 + 宽松辅助条件 | +| **uniform** | 所有列=50% | 多列均匀过滤(较少见) | +| **cascading** | 80%→60%→40%→20%递减 | 多条件逐步收窄 | + +#### 5.1.3 基准测试结果 + +**测试环境**:16 核 CPU @ 3496 MHz,L1D 48KB×8, L2 1280KB×8, L3 49152KB×1 + +##### 核心三组对比(mean time, µs) + +| 场景 | AllAtOnce | NoPushdown Best | NoPushdown Worst | WithPushdown Best | WithPushdown Worst | +|------|-----------|-----------------|------------------|-------------------|--------------------| +| 4 cols, skewed | 623 | 619 | 642 | **541** | 1535 | +| 4 cols, uniform | 625 | 624 | 653 | 1384 | 1456 | +| 4 cols, cascading | 619 | 629 | 640 | **898** | 1572 | +| 8 cols, skewed | 1260 | 1246 | 1271 | **893** | 3670 | +| 8 cols, uniform | 1269 | 1238 | 1302 | 1934 | 1912 | +| 8 cols, cascading | 1245 | 1233 | 1283 | **1173** | 2482 | +| 2 cols, skewed | 311 | 316 | 325 | 355 | 630 | + +##### P0-2 only(NoPushdown)vs AllAtOnce 对比 + +| 场景 | AllAtOnce (µs) | NoPushdown Best (µs) | 差异 | +|------|----------------|----------------------|------| +| 4 cols, skewed | 623 | 619 | -0.6%(噪声范围内) | +| 8 cols, skewed | 1260 | 1246 | -1.1%(噪声范围内) | +| 4 cols, cascading | 619 | 629 | +1.6%(噪声范围内) | + +> **结论:P0-2 单独(无 P0-1)基本没有性能收益。** 由于 decoder 仍解码全部行,逐列读取无法减少解码工作量。 + +##### P0-2 + P0-1(WithPushdown Best)vs AllAtOnce 对比 + +| 场景 | AllAtOnce (µs) | WithPushdown Best (µs) | 加速比 | +|------|----------------|------------------------|--------| +| 4 cols, skewed | 623 | **541** | **1.15x** | +| 8 cols, skewed | 1260 | **893** | **1.41x** | +| 8 cols, cascading | 1245 | **1173** | **1.06x** | +| 4 cols, cascading | 619 | **898** | 0.69x(退化) | +| 4 cols, uniform | 625 | 1384 | 0.45x(严重退化) | + +> **结论:P0-2 的价值在于作为 P0-1 的放大器。** 当 P0-1 使 decoder 可以跳过已过滤行时,P0-2 的逐列中间过滤才能减少后续列的实际解码量。 + +##### WithPushdown Best vs Worst(列顺序影响) + +| 场景 | Best (µs) | Worst (µs) | Worst/Best 倍数 | 
+|------|-----------|------------|-----------------| +| 4 cols, skewed | 541 | 1535 | **2.84x** | +| 8 cols, skewed | 893 | 3670 | **4.11x** | +| 8 cols, cascading | 1173 | 2482 | **2.12x** | +| 4 cols, cascading | 898 | 1572 | **1.75x** | + +> **结论:在 P0-1 pushdown 生效的前提下,列顺序影响极大。** 8 列 skewed 场景最优 vs 最差差距达 4.11 倍,充分证明了自适应排序的必要性。 + +##### Adaptive(ColumnReadOrderCtx)— 20 批次总耗时 + +| 场景 | Adaptive 总耗时 (µs) | 每 batch 平均 (µs) | WithPushdown Best (µs) | WithPushdown Worst (µs) | +|------|----------------------|---------------------|------------------------|-------------------------| +| 4 cols, skewed | 17,741 | ~887 | 541 | 1535 | +| 8 cols, skewed | — | — | 893 | 3670 | +| 4 cols, uniform | — | — | 1384 | 1456 | + +> Adaptive 每 batch 平均 ~887 µs(4 cols skewed),介于 Best (541) 和 Worst (1535) 之间。10 轮探索引入了开销,但利用期锁定后趋近 Best。 + +##### Filter 累积开销 + +| 配置 | 耗时 (µs) | 吞吐 | +|------|-----------|------| +| 2 cols × 100K rows | 94 | ~2.0 GB/s | +| 4 cols × 100K rows | 186 | ~2.0 GB/s | +| 8 cols × 100K rows | 372 | ~2.0 GB/s | +| 4 cols × 1M rows | 1895 | ~2.0 GB/s | + +> Filter AND 操作开销相对于列解码(~600-1200 µs)占比较小(<20%)。 + +##### ColumnReadOrderCtx 管理开销 + +| 列数 | 20 轮管理耗时 (ns) | 每 batch (ns) | +|------|--------------------|---------------| +| 2 | 36,255 | ~1,813 | +| 4 | 35,785 | ~1,789 | +| 8 | 36,147 | ~1,807 | +| 16 | 37,275 | ~1,864 | + +> Ctx 管理开销 ~1.8 µs/batch,完全可忽略(相比解码的 ms 级耗时)。 + +#### 5.1.4 性能分析 + +**核心发现:P0-2 是 P0-1 的放大器,二者协同才能产生显著收益。** + +1. **P0-2 单独无收益**:NoPushdown 组与 AllAtOnce 在所有场景下差异均在噪声范围内(±1.6%)。因为 decoder 仍解码全量行,逐列读取只是改变了 filter 评估时机,无法减少主要工作量。 + +2. **P0-2 + P0-1 在 skewed 场景收益显著**:8 列 skewed 场景加速 1.41x。机制:高选择性列先读 → P0-1 令 decoder 跳过 99% 的行 → 后续列解码量骤降。 + +3. **列顺序在 pushdown 下影响极大**:8 列 skewed 场景 Best vs Worst 差 4.11 倍。最差顺序将低选择性列排前面,后续列仍需解码大量行,完全浪费了 P0-1 的跳过能力。 + +4. **Uniform 场景 WithPushdown 退化**:4 列 uniform 场景 WithPushdown Best (1384 µs) 比 AllAtOnce (625 µs) 慢 2.2 倍。原因:`p02_decode_with_pushdown()` 的逐行分支检查(`if (filter[i])`)比 `p02_decode_no_pushdown()` 的批量 `memset` 开销更大,当无法通过中间过滤减少大量行时,逐行检查的分支开销成为瓶颈。**缓解**:可增加 selectivity gate,若检测到各列选择性接近则回退 AllAtOnce 路径。 + +5. **Cascading 场景有条件收益**:8 列 cascading 加速 1.06x(轻微),4 列 cascading 反而退化至 0.69x。这是因为 cascading 的每列过滤率不够极端(80%→20%),per-row 分支开销抵消了部分跳过收益。 + +6. **Adaptive 探索有效但有成本**:探索期(前10批)的平均 batch 耗时偏高,但利用期(后10批)锁定最优顺序后趋近 Best。对于典型 row group(100+ 批次)探索开销占比 <10%。 + +7. **实际生产中 P0-1 pushdown 使用真实 dict decode,非逐行 memset**:基准测试的 `p02_decode_with_pushdown()` 使用逐行分支模拟,实际的 dict decoder 跳过机制(RLE SkipBatch + 仅解码存活行的 dict lookup)效率更高,因此实际收益可能优于基准测试数据。 + +### 5.2 功能正确性测试方案 + +#### 5.2.1 单元测试(建议补充) + +**ColumnReadOrderCtx 正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 探索期返回随机顺序 | 前10次调用 get_column_read_order() 返回不同排列 | +| 利用期锁定最优 | 第11次起返回固定的 _best_order | +| update 正确记录最优 | 最低 round_cost 的排列被保留为 _best_order | +| 相同 cost 时比较 first_selectivity | selectivity 更低的排列优先 | +| 单列场景 | 只有1列时不崩溃,顺序不变 | +| 多列(16+)场景 | 大量列时 shuffle 和 update 正常 | + +**`_do_lazy_read_per_column()` 正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 单列谓词 | 与原始 `_do_lazy_read()` 结果完全一致 | +| 多列谓词(per-col + multi-col) | 联合过滤结果正确 | +| filter_all 场景 | 所有行被过滤,正确清空并重试 | +| 无 per-col conjunct 的列 | 这些列正常读取,不参与中间过滤 | +| intermediate_filter_map 传递 | 后续列确实只解码存活行 | + +#### 5.2.2 集成测试(建议执行) + +```sql +-- 1. 多列谓词,有高选择性列 +SELECT * FROM parquet_table +WHERE rare_col = 'UNCOMMON' AND common_col > 0; + +-- 2. 多列谓词,均匀选择性 +SELECT * FROM parquet_table +WHERE col_a BETWEEN 10 AND 50 AND col_b BETWEEN 10 AND 50; + +-- 3. 单列谓词(应退化为原始路径) +SELECT * FROM parquet_table WHERE id = 12345; + +-- 4. 
无谓词(不触发 per-column 路径) +SELECT count(*) FROM parquet_table; + +-- 5. 配置开关关闭时走原始路径 +-- SET enable_parquet_per_column_lazy_read = false; +-- 重复上述查询,验证结果一致 +``` + +### 5.3 回归测试方案 + +#### 5.3.1 配置开关对比测试 + +| 场景 | 配置 | 预期 | +|------|------|------| +| A | `enable_parquet_per_column_lazy_read = true` | 结果正确,skewed 场景性能提升 | +| B | `enable_parquet_per_column_lazy_read = false` | 结果正确,走原始 `_do_lazy_read()` | + +验证:A 和 B 的查询结果完全一致(`diff` 比较)。 + +#### 5.3.2 边界条件测试 + +| 场景 | 描述 | +|------|------| +| 单谓词列 | 只有 1 个谓词列时不创建 ColumnReadOrderCtx | +| 全部是 multi-col conjunct | 无 per-col conjunct,不触发 per-column 路径 | +| 探索期遇到 filter_all | while 循环重试逻辑正确 | +| batch_size 极小(1行) | per-column 路径不崩溃 | +| 谓词列包含 dict filter 列 | 与 dict filter 机制兼容 | + +--- + +## 6. 已知限制与风险 + +### 6.1 性能限制 + +- **Uniform 场景退化**:当所有谓词列选择性相近时,per-column 路径引入额外的 filter combine 和 intermediate_filter_map 构造开销,可能慢于 AllAtOnce 路径。 +- **探索成本**:前10批使用随机排列,可能包含较差的顺序。对于 row group 批次很少(<20)的场景,探索成本占比较大。 +- **单列谓词要求**:只有拥有单列 conjunct 的谓词列才能参与逐列过滤优化。纯多列 conjunct(如 `col_a + col_b > 100`)无法拆分。 + +### 6.2 缓解措施 + +| 风险 | 缓解方案 | +|------|----------| +| Uniform 退化 | 可增加 selectivity gate:若前几批发现所有列选择性接近(如方差 < 阈值),回退到 AllAtOnce | +| 探索成本 | 10 轮探索 + 锁定,对于典型 row group(100+ 批次)探索开销占比 <10% | +| 多列 conjunct | 多列 conjunct 在所有谓词列读完后统一评估,不影响正确性 | + +### 6.3 兼容性 + +- **无协议变更**:仅 BE 内部读取逻辑优化 +- **向后兼容**:通过 `enable_parquet_per_column_lazy_read = false` 完全关闭 +- **与 P0-1 协同**:per-column 路径通过 `intermediate_filter_map` 向下传递已累积的过滤信息,P0-1 的 filter bitmap pushdown 在后续列的解码层生效 + +--- + +## 7. Benchmark 复现指南 + +### 7.1 环境准备 + +```bash +# 确保 benchmark 构建目录存在 +ls be/build_benchmark/build.ninja + +# 增量构建 +cd be/build_benchmark && ninja -j 10 benchmark_test +``` + +### 7.2 运行方式 + +```bash +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd)/be/build_benchmark + +# 运行全部 P0-2 benchmark +./bin/benchmark_test --benchmark_filter="BM_P02_" + +# 运行带重复的精确测量 +./bin/benchmark_test \ + --benchmark_filter="BM_P02_" \ + --benchmark_repetitions=3 \ + --benchmark_report_aggregates_only=true + +# 只运行核心三组对比 +./bin/benchmark_test --benchmark_filter="BM_P02_AllAtOnce|BM_P02_PerCol_NoPushdown|BM_P02_PerCol_WithPushdown" + +# 只运行 Ctx 开销测试 +./bin/benchmark_test --benchmark_filter="BM_P02_CtxOverhead" +``` + +### 7.3 Benchmark 文件说明 + +**文件**:`be/benchmark/benchmark_column_read_order.hpp` + +**核心设计**:通过两种不同的 decode 模拟函数,将 P0-1(decoder 级过滤)和 P0-2(列读取顺序)的效果完全分离: + +- `p02_decode_no_pushdown(num_rows, cost, scratch)` — 全量解码(`memset` 全部行),用于 AllAtOnce 和 NoPushdown 组 +- `p02_decode_with_pushdown(filter, num_rows, cost, scratch)` — 仅解码存活行(逐行检查 filter),用于 WithPushdown 和 Adaptive 组 + +每列有一个 decode cost(32 bytes/row)和一个 selectivity(决定过滤比例)。"过滤" = bitwise AND 合并过滤位图。 + +辅助函数: +- `p02_gen_column_filter(num_rows, selectivity, seed)` — 按给定选择率生成过滤位图 +- `p02_combine_filters(combined, col_filter, num_rows)` — bitwise AND 合并 +- `p02_count_survivors(filter, num_rows)` — 统计存活行数 +- `p02_build_sim_columns(num_rows, num_cols, costs, selectivities)` — 构建模拟列配置 + +参数格式为 `(num_cols, num_rows_in_thousands, scenario)`,其中 scenario: 0=skewed, 1=uniform, 2=cascading。 + +--- + +## 8. 
总结 + +P0-2 优化通过逐列读取谓词列 + 中间过滤 + 自适应排序,**与 P0-1(Filter Bitmap Pushdown)协同**,在存在高选择性谓词列的场景下显著减少了后续列的解码量。 + +### 核心发现 + +**P0-2 是 P0-1 的放大器,二者必须协同才能产生显著收益。** + +- **P0-2 单独**(NoPushdown):与 AllAtOnce 基线相比差异在噪声范围内(±1.6%)。逐列读取改变了 filter 评估时机,但 decoder 仍解码全部行,无法减少主要工作量。 +- **P0-2 + P0-1**(WithPushdown):8 列 skewed 场景加速 **1.41x**。高选择性列先读后,P0-1 令 decoder 跳过大量行,后续列解码量骤降。 + +### 关键数据 + +1. **P0-2 + P0-1 synergy** — 8 列 skewed 场景:AllAtOnce 1260 µs → WithPushdown Best 893 µs,加速 1.41x +2. **列顺序影响** — 8 列 skewed 场景:Best 893 µs vs Worst 3670 µs,差距 **4.11 倍**,充分证明自适应排序的必要性 +3. **ColumnReadOrderCtx 自适应排序** — 10 轮探索自动找到接近最优的列顺序,管理开销 ~1.8 µs/batch 可忽略 +4. **Uniform 场景退化** — WithPushdown 的逐行分支开销在无大量行可跳过时成为瓶颈,4 列 uniform 退化至 0.45x。需通过 selectivity gate 回退 AllAtOnce 路径 + +### 架构意义 + +P0-2 的逐列读取 + 中间过滤为 P0-1 的 decoder 级跳过提供了前置条件(intermediate_filter_map),形成了完整的"**逐列过滤 → 累积 filter → decoder 跳过 → 下一列更少行**"优化链路。单独使用任何一个优化效果有限,组合使用才能发挥最大威力。 + +### 生产安全 + +- 运行时可调配置 `enable_parquet_per_column_lazy_read`,可随时关闭回退原始路径 +- Uniform 场景可通过后续 selectivity gate 进一步优化(检测各列选择性方差,若接近则回退 AllAtOnce) diff --git a/docs/P0-3_Lazy_Dict_Decode_Test_Report.md b/docs/P0-3_Lazy_Dict_Decode_Test_Report.md new file mode 100644 index 00000000000000..230e189b7a468d --- /dev/null +++ b/docs/P0-3_Lazy_Dict_Decode_Test_Report.md @@ -0,0 +1,547 @@ +# P0-3 惰性列字典延迟解码优化 — 测试文档 + +## 1. 功能概述 + +本优化为 Doris Parquet Reader 的 P0-3 优化项:**惰性列字典延迟解码(Lazy Dictionary Decode for Lazy String Columns)**,实现了 Phase 2 惰性字符串列的"先解码为字典索引 int32,过滤后再转换为字符串"策略。 + +### 1.1 优化目标 + +在 lazy read 模式下,Phase 2 读取的惰性字符串列(不参与谓词过滤)需要全量解码为字符串。当 Phase 1 的过滤率较高时,大量被过滤行的字符串解码是浪费。P0-3 优化将这些列的解码分为两步: +- **Step 1**:以字典索引(int32)形式读取全部行——写 4 字节整数远快于字典查找 + 字符串拷贝 +- **Step 2**:Phase 1 过滤完成后,仅对存活行执行字典索引 → 字符串的转换 + +### 1.2 核心对比 + +| | 原始路径(Eager String) | 优化路径(Lazy Dict Decode) | +|---|---|---| +| Phase 2 解码 | 全部 N 行解码为字符串(dict lookup + string copy) | 全部 N 行解码为 int32(写 4 字节) | +| 过滤后处理 | 直接 filter 字符串列 | 先 filter int32 列,再 `convert_dict_column_to_string_column` 仅 S 行 | +| 内存占用 | N × avg_str_len | N × 4 + S × avg_str_len | +| 适用条件 | 通用 | 列必须全字典编码(PLAIN_DICTIONARY / RLE_DICTIONARY) | + +### 1.3 与 P0-1 的关系 + +P0-1(Filter Bitmap Pushdown)的懒惰解码路径 `_lazy_decode_string_values()` 对字符串列实际有**负面效果**(比基线慢 24-152%),因为 per-RLE-run 的 `GetBatch` + `SkipBatch` 开销大于一次性全量 `GetBatch`。P0-3 是字符串惰性列的正确优化策略,通过改变数据类型(string → int32)而非改变解码粒度来避免无效字符串物化。 + +--- + +## 2. 修改文件清单 + +### 2.1 核心修改 + +| 文件 | 修改内容 | 重要程度 | +|------|----------|----------| +| `be/src/vec/exec/format/parquet/vparquet_reader.cpp` | 候选列识别:遍历惰性列,检查字符串 slot 类型 + BYTE_ARRAY 物理类型,加入 `lazy_dict_decode_candidates` | 高 | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.h` | 新增 `lazy_dict_decode_candidates` 字段(LazyReadContext)、`_lazy_dict_decode_cols` 成员、`_convert_lazy_dict_cols_to_string_cols()` 声明 | 高 | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` | Row group 级确认(`is_dictionary_encoded`)、`_read_column_data()` 中替换为 ColumnInt32、Phase 2 后调用转换函数、`_convert_lazy_dict_cols_to_string_cols()` 实现 (~56 行) | 高 | +| `be/src/common/config.h` | 新增 `enable_parquet_lazy_dict_decode_for_lazy_columns` 配置项 | 中 | +| `be/src/common/config.cpp` | 对应定义 | 中 | + +### 2.2 新增文件 + +| 文件 | 说明 | +|------|------| +| `be/benchmark/benchmark_lazy_dict_decode.hpp` | 微基准测试:4 种策略对比 + 转换开销测量 | + +### 2.3 已修改文件(其他 P0 共享) + +| 文件 | P0-3 相关修改 | +|------|--------------| +| `be/benchmark/benchmark_main.cpp` | 新增 `#include "benchmark_lazy_dict_decode.hpp"` | + +--- + +## 3. 
配置项 + +### 3.1 `enable_parquet_lazy_dict_decode_for_lazy_columns` + +| 属性 | 值 | +|------|-----| +| 类型 | mBool(运行时可修改) | +| 默认值 | `true` | +| 作用 | 控制是否对 Phase 2 惰性字符串列启用字典延迟解码 | +| 关闭方式 | `curl http://be_host:webserver_port/api/update_config?enable_parquet_lazy_dict_decode_for_lazy_columns=false` | + +### 3.2 触发条件 + +惰性列字典延迟解码在同时满足以下条件时激活: +1. `enable_parquet_lazy_dict_decode_for_lazy_columns = true` +2. 惰性列的 slot 类型为字符串类型(`TYPE_STRING` / `TYPE_VARCHAR` / `TYPE_CHAR`) +3. 列的 Parquet 物理类型为 `BYTE_ARRAY` +4. 当前 row group 中该列全字典编码(通过 `is_dictionary_encoded()` 检查 `encoding_stats` 或 `encodings` 元数据) + +--- + +## 4. 技术实现细节 + +### 4.1 候选列识别 + +**位置**:`vparquet_reader.cpp::set_fill_columns()` (~line 564-599) + +遍历所有惰性列,筛选满足条件的候选列: + +``` +for each lazy_column: + slot_type = slot_desc->type().type + if slot_type in {TYPE_STRING, TYPE_VARCHAR, TYPE_CHAR}: + parquet_col = find_column_in_schema(lazy_column.name) + if parquet_col.physical_type == BYTE_ARRAY: + lazy_read_ctx.lazy_dict_decode_candidates.push_back({col_name, slot_id}) +``` + +### 4.2 Row Group 级确认 + +**位置**:`vparquet_group_reader.cpp::init()` (~line 254-268) + +在每个 row group 初始化时,逐一检查候选列的编码类型: + +``` +for each candidate in lazy_dict_decode_candidates: + column_metadata = get_column_metadata(candidate.name) + if is_dictionary_encoded(column_metadata): + _lazy_dict_decode_cols.push_back(candidate) +``` + +`is_dictionary_encoded()` 检查逻辑(line 367-425): +- **优先检查 `encoding_stats`**(Parquet v2.6+):要求所有 `DATA_PAGE` / `DATA_PAGE_V2` 的编码为 `PLAIN_DICTIONARY` 或 `RLE_DICTIONARY` +- **回退检查 `encodings`**:排除 `PLAIN_DICTIONARY` / `RLE_DICTIONARY` / `RLE`(用于定义级别)后,确认无其他编码 + +### 4.3 Phase 2 读取为 int32 + +**位置**:`vparquet_group_reader.cpp::_read_column_data()` (~line 568-594) + +在 `_dict_filter_cols` 检查循环之后,新增对 `_lazy_dict_decode_cols` 的检查。匹配到的列执行与 dict filter 列相同的类型替换: + +``` +for col in _lazy_dict_decode_cols: + if block.column_name == col.name: + // 替换列为 ColumnInt32 + DataTypeInt32 + replace_column_with_int32(block[i]) + is_dict_filter = true // 使 decoder 输出 dict indices + break +``` + +### 4.4 过滤后字典转换 + +**位置**:`vparquet_group_reader.cpp::_convert_lazy_dict_cols_to_string_cols()` (~line 1384-1435) + +在 Phase 2 完成过滤后,将 int32 字典索引列转换回字符串: + +``` +for col in _lazy_dict_decode_cols: + find column in block by slot_id + if column is empty (all rows filtered): + restore original string type with empty column + continue + + if column is nullable: + extract nested ColumnInt32 from nullable wrapper + convert_dict_column_to_string_column(int32_col) → string_col + re-wrap with nullable + else: + convert_dict_column_to_string_column(int32_col) → string_col + + replace column in block +``` + +`convert_dict_column_to_string_column()` 由 `ByteArrayDictDecoder` 提供,对每个 int32 索引执行字典查找,构建 `ColumnString`。 + +### 4.5 调用链路 + +``` +ParquetReader::set_fill_columns() + → 识别 lazy_dict_decode_candidates(字符串 + BYTE_ARRAY) + +RowGroupReader::init() + → 逐列检查 is_dictionary_encoded() + → 确认 _lazy_dict_decode_cols + +RowGroupReader::_do_lazy_read() / _do_lazy_read_per_column() + Phase 1: 读取谓词列 → 过滤 + Phase 2: _read_column_data() + → _lazy_dict_decode_cols 匹配的列替换为 ColumnInt32 + → decoder 输出 dict indices(4 字节/行) + → filter Phase 2 列 + → _convert_dict_cols_to_string_cols() // 谓词列的 dict filter 转换 + → _convert_lazy_dict_cols_to_string_cols() // 惰性列的字典转换(仅存活行) +``` + +--- + +## 5. 
测试方案 + +### 5.1 微基准测试(已完成) + +#### 5.1.1 构建与运行 + +```bash +# 增量构建 +cd be/build_benchmark && ninja -j 10 benchmark_test + +# 运行 P0-3 benchmark +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd)/be/build_benchmark +./bin/benchmark_test --benchmark_filter="BM_P03_" --benchmark_repetitions=3 --benchmark_report_aggregates_only=true +``` + +#### 5.1.2 基准测试用例一览 + +基准测试使用**真实 `ByteArrayDictDecoder`**(非模拟),对比 4 种策略: + +| 组别 | 测试名 | P0-1 | P0-3 | 机制 | +|------|--------|------|------|------| +| 基线 | `BM_P03_Baseline` | 否 | 否 | `decode_values(ColumnString, is_dict_filter=false, filter_data=nullptr)` — 全部行解码为字符串 | +| P0-1 Only | `BM_P03_P01Only` | 是 | 否 | `decode_values(ColumnString, is_dict_filter=false, filter_data=bitmap)` — 懒惰解码仅存活行为字符串 | +| P0-3 Only | `BM_P03_P03Only` | 否 | 是 | `decode_values(ColumnInt32, is_dict_filter=true, filter_data=nullptr)` → filter int32 → `convert_dict_column_to_string_column` 仅存活行 | +| P0-3+P0-1 | `BM_P03_P03PlusP01` | 是 | 是 | `decode_values(ColumnInt32, is_dict_filter=true, filter_data=bitmap)` → `convert_dict_column_to_string_column` | +| 辅助 | `BM_P03_ConvertOverhead` | — | — | 纯 `convert_dict_column_to_string_column` 开销测量 | + +参数格式:`(dict_size, selectivity_percent, num_values_in_thousands, avg_str_len)` + +**P0-3+P0-1 的代码路径细节**:当 `is_dict_filter=true` 且 `filter_data!=nullptr` 时,`byte_array_dict_decoder.cpp` 的 `_decode_values()` 路径**不使用** `_lazy_decode_string_values()`,而是走 bulk `GetBatch` 解码全部 RLE 索引,然后 `_decode_dict_values` 通过 `ColumnSelectVector` 仅写入 CONTENT 行的 int32 值到 `ColumnInt32`。因此 P0-3+P0-1 仍解码全部 RLE 索引,但写入更少的 int32 值 + 转换更少的字符串。 + +#### 5.1.3 基准测试结果 + +**测试环境**:16 核 CPU @ 3437.92 MHz,L1D 48KB×8, L2 1280KB×8, L3 49152KB×1 + +##### 小字典(dict=100),短字符串(strlen=32),100K 行 + +| 存活率 | Baseline (µs) | P0-1 Only (µs) | P0-3 Only (µs) | P0-3+P0-1 (µs) | 最优策略 | +|--------|---------------|-----------------|-----------------|-----------------|----------| +| 5% | 261 | 337 | **167** | 180 | P0-3 Only (1.56x) | +| 10% | 426 | 569 | **248** | 268 | P0-3 Only (1.72x) | +| 20% | 723 | 1000 | **437** | 450 | P0-3 Only (1.65x) | +| 50% | 1328 | 1764 | 1014 | **856** | P0-3+P0-1 (1.55x) | +| 100% | **501** | 509 | 744 | 507 | Baseline (无过滤) | + +##### 小字典(dict=100),长字符串(strlen=128),100K 行 + +| 存活率 | Baseline (µs) | P0-1 Only (µs) | P0-3 Only (µs) | P0-3+P0-1 (µs) | 最优策略 | +|--------|---------------|-----------------|-----------------|-----------------|----------| +| 5% | 296 | 375 | **168** | 213 | P0-3 Only (1.76x) | +| 20% | 804 | 1093 | **460** | 561 | P0-3 Only (1.75x) | +| 50% | 3781 | 4522 | **1162** | 3151 | P0-3 Only (3.25x) | +| 100% | 5280 | **5155** | 5598 | 5717 | P0-1 Only ≈ Baseline | + +##### 中字典(dict=10000),短字符串(strlen=32),100K 行 + +| 存活率 | Baseline (µs) | P0-1 Only (µs) | P0-3 Only (µs) | P0-3+P0-1 (µs) | 最优策略 | +|--------|---------------|-----------------|-----------------|-----------------|----------| +| 5% | 325 | 355 | **185** | 214 | P0-3 Only (1.76x) | +| 20% | 826 | 1048 | **467** | 486 | P0-3 Only (1.77x) | +| 50% | 1451 | 1855 | 1043 | **919** | P0-3+P0-1 (1.58x) | + +##### 中字典(dict=10000),长字符串(strlen=128),100K 行 + +| 存活率 | Baseline (µs) | P0-1 Only (µs) | P0-3 Only (µs) | P0-3+P0-1 (µs) | 最优策略 | +|--------|---------------|-----------------|-----------------|-----------------|----------| +| 5% | 389 | 474 | **191** | 254 | P0-3 Only (2.04x) | +| 20% | 2341 | 1266 | **498** | 665 | P0-3 Only (4.70x) | + +##### 转换开销(`convert_dict_column_to_string_column`) + +| 字典大小 
| 行数 | 字符串长度 | 耗时 (µs) | +|---------|------|-----------|-----------| +| 100 | 5K | 32 | 17.7 | +| 100 | 50K | 32 | 201 | +| 100 | 100K | 32 | 437 | +| 100 | 5K | 128 | 36.2 | +| 100 | 100K | 128 | 5243 | +| 10000 | 5K | 32 | 20.6 | +| 10000 | 100K | 32 | 501 | +| 10000 | 5K | 128 | 55.6 | +| 10000 | 100K | 128 | 6160 | + +#### 5.1.4 性能分析 + +**核心发现:P0-3 是惰性字符串列的最优策略,全面优于 P0-1 和基线。** + +##### 1. P0-1 Only 对字符串列有负面效果 + +| 场景 | Baseline (µs) | P0-1 Only (µs) | 差异 | +|------|---------------|-----------------|------| +| dict=100, strlen=32, sel=5% | 261 | 337 | **+29% 退化** | +| dict=100, strlen=32, sel=10% | 426 | 569 | **+34% 退化** | +| dict=100, strlen=128, sel=5% | 296 | 375 | **+27% 退化** | +| dict=100, strlen=128, sel=50% | 3781 | 4522 | **+20% 退化** | + +**原因分析**:P0-1 的 `_lazy_decode_string_values()` 按 RLE run 粒度处理,每个 CONTENT run 独立调用 `GetBatch` + `insert_many_strings_overflow`,每个 FILTERED_CONTENT run 调用 `SkipBatch`。这种 per-run 开销累积显著大于原始路径的一次性 `GetBatch`(全量索引解码) + 遍历 ColumnSelectVector(仅 CONTENT 行做字典查找)。字符串物化成本(字典查找 + 字符串拷贝)在两条路径中相同,而 P0-1 增加了额外的 per-run 管理开销。 + +**结论**:P0-1 的 filter bitmap pushdown **不应应用于**字符串惰性列。当前代码正确处理了这一点——`is_dict_filter=true` 时 `_decode_values` 不会进入 `_lazy_decode_string_values` 路径。 + +##### 2. P0-3 Only 在全部 <100% 选择率下全面领先 + +| 场景 | vs Baseline 加速比 | 关键优势 | +|------|-------------------|----------| +| dict=100, strlen=32, sel=5% | 1.56x | int32 解码远快于字符串 | +| dict=100, strlen=32, sel=10% | 1.72x | | +| dict=100, strlen=128, sel=5% | 1.76x | 长字符串放大优势 | +| dict=100, strlen=128, sel=50% | **3.25x** | 50% 行的字符串物化节省巨大 | +| dict=10000, strlen=128, sel=5% | 2.04x | 大字典 + 长字符串 | +| dict=10000, strlen=128, sel=20% | **4.70x** | **最大加速比** | + +**核心机制**:解码 N 行 int32(写 4 字节/行)的成本约为解码 N 行字符串(字典查找 + 字符串拷贝 avg_str_len 字节/行)的 1/3 ~ 1/10。即使 P0-3 仍解码全部 N 行为 int32,总成本 = (N × int32 decode cost) + (S × string convert cost) 远小于 (N × string decode cost),只要 S << N。 + +##### 3. 长字符串显著放大 P0-3 优势 + +| 存活率 | strlen=32 加速比 | strlen=128 加速比 | 放大倍数 | +|--------|----------------|------------------|----------| +| 5%, dict=100 | 1.56x | 1.76x | 1.13x | +| 50%, dict=100 | 1.31x (P0-3 Only) | **3.25x** | 2.48x | +| 20%, dict=10000 | 1.77x | **4.70x** | 2.65x | + +字符串越长,每行字符串物化成本越高,P0-3 的"延迟到过滤后再物化"策略收益越大。 + +##### 4. P0-3+P0-1 在低选择率时略逊于 P0-3 Only,50% 时反超 + +| 存活率 | P0-3 Only (µs) | P0-3+P0-1 (µs) | 差异 | +|--------|-----------------|-----------------|------| +| 5%, dict=100, strlen=32 | 167 | 180 | P0-3+P0-1 慢 8% | +| 20%, dict=100, strlen=32 | 437 | 450 | P0-3+P0-1 慢 3% | +| 50%, dict=100, strlen=32 | 1014 | **856** | P0-3+P0-1 快 16% | +| 50%, dict=10000, strlen=32 | 1043 | **919** | P0-3+P0-1 快 12% | + +**原因**:当 `is_dict_filter=true` 且 `filter_data!=nullptr` 时,P0-1 通过 ColumnSelectVector 跳过 FILTERED_CONTENT 行的 int32 写入。在低选择率下,节省的 int32 写入量很少(int32 写入本身就很廉价),但 ColumnSelectVector 的 per-run 处理开销使总成本略增。在 50% 选择率下,跳过的 int32 写入量足够多,收益超过了开销。 + +##### 5. 100% 选择率时 P0-3 有退化 + +| 场景 | Baseline (µs) | P0-3 Only (µs) | 退化比例 | +|------|---------------|-----------------|----------| +| dict=100, strlen=32, sel=100% | 501 | 744 | **+49%** | +| dict=100, strlen=128, sel=100% | 5280 | 5598 | +6% | + +**原因**:100% 选择率时无行被过滤,P0-3 的解码路径为 (N × int32 decode) + (N × string convert),比直接 (N × string decode) 多了一次完整的数据遍历。短字符串时退化更明显(因为 string decode 的绝对成本较低,额外遍历开销占比更大)。 + +**缓解方案**:建议增加选择率门控,当 `filter_ratio < 0.05`(即存活率 > 95%)时禁用 P0-3,回退到直接字符串解码。 + +##### 6. 
转换开销线性扩展 + +`convert_dict_column_to_string_column` 的开销与 `行数 × 字符串长度` 成线性关系: +- 100K 行 × strlen=32:~437-501 µs +- 100K 行 × strlen=128:~5243-6160 µs +- 5K 行 × strlen=32:~17-21 µs + +这是存活行的主要成本。P0-3 的优势在于将此成本从 N 行降低到 S 行(S = 存活行数)。 + +### 5.2 功能正确性测试方案 + +#### 5.2.1 单元测试(建议补充) + +**候选列识别正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 字符串惰性列 + BYTE_ARRAY | 被正确加入 candidates | +| 非字符串惰性列(INT、DOUBLE) | 不加入 candidates | +| 字符串谓词列(非惰性) | 不加入 candidates | +| FIXED_LEN_BYTE_ARRAY 字符串列 | 不加入 candidates(仅 BYTE_ARRAY) | + +**Row group 级确认正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 全字典编码列 | 加入 `_lazy_dict_decode_cols` | +| 混合编码列(部分 page 用 PLAIN) | 不加入 | +| encoding_stats 存在时 | 优先使用 encoding_stats 判断 | +| 仅 encodings 字段时 | 回退到 encodings 检查 | + +**`_convert_lazy_dict_cols_to_string_cols()` 正确性** + +| 测试场景 | 验证点 | +|----------|--------| +| 非 nullable 列 | int32 → string 转换正确 | +| nullable 列 | null bitmap 保留,非 null 行正确转换 | +| 全部被过滤(空列) | 正确恢复字符串类型,列为空 | +| 字典索引为 0(第一个字典项) | 不会被误判为空/null | +| 大字典(10000+ 条目) | 全部索引正确映射 | + +#### 5.2.2 集成测试(建议执行) + +```sql +-- 1. 基础查询:惰性字符串列在过滤后正确返回 +SELECT name, address FROM parquet_table WHERE id = 12345; + +-- 2. 多字符串惰性列 +SELECT col_str_a, col_str_b, col_str_c FROM parquet_table +WHERE int_col BETWEEN 1 AND 10; + +-- 3. 字符串列包含 null 值 +SELECT nullable_str_col FROM parquet_table WHERE status = 'ACTIVE'; + +-- 4. 高选择率(验证 P0-3 不引入额外开销) +SELECT name FROM parquet_table WHERE id > 0; + +-- 5. 无过滤条件(不触发 lazy read) +SELECT count(*) FROM parquet_table; + +-- 6. 配置开关关闭时走原始路径 +-- curl ...update_config?enable_parquet_lazy_dict_decode_for_lazy_columns=false +-- 重复上述查询,验证结果一致 + +-- 7. 混合编码列(部分 row group 非字典编码) +-- 验证该列在非字典编码 row group 中回退到直接字符串解码 +``` + +**外表类型覆盖**: +- Hive 外表(Parquet 格式) +- Iceberg 外表(Parquet 格式) +- 直接 `SELECT * FROM S3()` 读取 Parquet 文件 + +### 5.3 回归测试方案 + +#### 5.3.1 配置开关对比测试 + +| 场景 | 配置 | 预期 | +|------|------|------| +| A | `enable_parquet_lazy_dict_decode_for_lazy_columns = true` | 结果正确,有过滤时性能提升 | +| B | `enable_parquet_lazy_dict_decode_for_lazy_columns = false` | 结果正确,走原始字符串解码路径 | + +验证:A 和 B 的查询结果完全一致(`diff` 比较)。 + +#### 5.3.2 边界条件测试 + +| 场景 | 描述 | +|------|------| +| 空 Page | 0 行数据的 Parquet page | +| 全 null 字符串列 | 所有行都是 null,无 int32 索引需转换 | +| 全非 null 列 | 无 null 值 | +| 单行 Page | 每个 page 只有 1 行 | +| 字典仅 1 个条目 | 极端小字典 | +| 字典很大(100K+条目) | 验证 convert 开销在预期范围内 | +| 空字符串值 | 字典中包含 "" 的情况 | +| 超长字符串值(64KB+) | 验证内存分配正确 | +| filter_all 场景 | Phase 1 全部行被过滤,Phase 2 不读取惰性列 | +| 跨 row group 切换 | 验证 `_lazy_dict_decode_cols` 在每个 row group 重新确认 | +| 某个 row group 非字典编码 | 该 row group 回退直接解码,不影响其他 row group | + +--- + +## 6. 
已知限制与风险 + +### 6.1 性能限制 + +- **100% 选择率退化**:当无行被过滤时,P0-3 多了一次数据遍历(int32 解码 + 全量 convert),比直接字符串解码慢 6-49%。建议增加选择率门控(filter_ratio < 0.05 时禁用)。 +- **P0-1 对字符串列有害**:P0-1 的 `_lazy_decode_string_values()` 在字符串列上比基线慢 20-34%。P0-3 惰性列不应使用 P0-1 的 filter bitmap pushdown。当前代码正确处理了这一点(`is_dict_filter=true` 时不进入 lazy decode 路径)。 +- **仅适用于全字典编码列**:如果某列在部分 page 使用 PLAIN 编码(fallback),该列在该 row group 不会启用 P0-3。 +- **转换开销与字符串长度正相关**:strlen=128 时 100K 行的转换开销达 ~5-6 ms,占总时间比例较大。对于超长字符串(如 JSON/XML 存储),存活行的转换成本可能成为瓶颈。 + +### 6.2 兼容性 + +- **无协议变更**:仅 BE 内部解码逻辑优化,不涉及存储格式、网络协议、FE 变更。 +- **向后兼容**:通过配置开关 `enable_parquet_lazy_dict_decode_for_lazy_columns = false` 完全关闭。 +- **与 P0-1 的关系**:P0-3 列以 `is_dict_filter=true` 模式读取,decoder 直接输出 int32 字典索引。P0-1 的 filter_data 即使被传递,也不会进入 `_lazy_decode_string_values()` 路径——而是走 bulk GetBatch + ColumnSelectVector 路径,仅影响 int32 写入量。 +- **与 P0-2 的兼容**:P0-3 惰性列在 Phase 2 读取,不参与 P0-2 的谓词列排序。两者完全正交。 + +### 6.3 潜在风险 + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| `is_dictionary_encoded()` 误判 | 非字典编码列被当作字典列读取,decode 输出错误数据 | 已使用 Parquet 标准的 `encoding_stats` / `encodings` 元数据,与已有 `_dict_filter_cols` 使用相同检查逻辑 | +| 空列转换崩溃 | 全部行被过滤后 ColumnInt32 为空,convert 可能越界 | 已在 `_convert_lazy_dict_cols_to_string_cols()` 中特殊处理空列情况 | +| nullable 列 unwrap 错误 | 从 ColumnNullable 中错误提取内部列 | 使用与 `_convert_dict_cols_to_string_cols()` 相同的 nullable 处理逻辑 | +| 选择率门控缺失 | 100% 选择率时性能退化 | 建议后续增加 filter_ratio 门控 | + +--- + +## 7. Benchmark 复现指南 + +### 7.1 环境准备 + +```bash +# 确保 benchmark 构建目录存在 +ls be/build_benchmark/build.ninja + +# 增量构建 +cd be/build_benchmark && ninja -j 10 benchmark_test +``` + +### 7.2 运行方式 + +```bash +export JAVA_HOME=/path/to/jdk17 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=$(pwd)/be/build_benchmark + +# 运行全部 P0-3 benchmark +./bin/benchmark_test --benchmark_filter="BM_P03_" + +# 运行带重复的精确测量 +./bin/benchmark_test \ + --benchmark_filter="BM_P03_" \ + --benchmark_repetitions=3 \ + --benchmark_report_aggregates_only=true + +# 只运行核心四组对比(排除 ConvertOverhead) +./bin/benchmark_test --benchmark_filter="BM_P03_Baseline|BM_P03_P01Only|BM_P03_P03Only|BM_P03_P03PlusP01" + +# 只运行转换开销测试 +./bin/benchmark_test --benchmark_filter="BM_P03_ConvertOverhead" + +# 只运行特定字典大小 +./bin/benchmark_test --benchmark_filter="BM_P03_.*/100/" + +# 只运行长字符串测试 +./bin/benchmark_test --benchmark_filter="BM_P03_.*/128" +``` + +### 7.3 Benchmark 文件说明 + +**文件**:`be/benchmark/benchmark_lazy_dict_decode.hpp` + +**核心设计**:使用真实 `ByteArrayDictDecoder` 实例,通过控制 `is_dict_filter`、`filter_data` 两个参数组合隔离 P0-1 和 P0-3 的效果: + +| 参数组合 | is_dict_filter | filter_data | 策略 | +|----------|---------------|-------------|------| +| Baseline | false | nullptr | 原始全量字符串解码 | +| P0-1 Only | false | bitmap | 懒惰字符串解码(per-run) | +| P0-3 Only | true | nullptr | 全量 int32 解码 + convert | +| P0-3+P0-1 | true | bitmap | int32 解码(skip FILTERED_CONTENT) + convert | + +辅助函数: +- 复用 `be/benchmark/benchmark_parquet_dict_decoder.hpp` 中的: + - `build_string_dict(dict_size, avg_str_len)` — 构建 ByteArray 字典 + - `build_rle_dict_indexes(num_values, dict_size)` — 生成 RLE 编码字典索引 + - `build_run_length_null_map(num_values)` — 构建无 null 的 run length null map + - `build_filter_bitmap(num_values, selectivity)` — 生成过滤位图 + +参数格式为 `(dict_size, selectivity_percent, num_values_in_thousands, avg_str_len)`。 + +--- + +## 8. 总结 + +P0-3 优化通过将惰性字符串列的解码分为"int32 字典索引解码"和"过滤后字符串转换"两步,在 Phase 2 有过滤的场景下显著减少了字符串物化开销。 + +### 核心发现 + +**P0-3 是惰性字符串列的最优策略,全面且大幅优于 P0-1 和基线。** + +#### 关键数据 + +1. 
**P0-3 Only 最高加速 4.70x**:dict=10000, strlen=128, sel=20% 场景,Baseline 2341 µs → P0-3 Only 498 µs +2. **P0-3 Only 在全部 <100% 选择率下均优于基线**:加速范围 1.56x ~ 4.70x +3. **长字符串显著放大优势**:strlen=128 时加速比是 strlen=32 的 1.1x ~ 2.65x +4. **P0-1 对字符串列有害**:比基线慢 20-34%,不应用于字符串惰性列 +5. **P0-3+P0-1 仅在 50% 选择率时优于 P0-3 Only**:低选择率时 P0-3 Only 更优 +6. **100% 选择率退化 6-49%**:需增加选择率门控 + +#### 策略选择建议 + +| 场景 | 推荐策略 | +|------|----------| +| 惰性字符串列 + 有过滤(存活率 < 95%) | **P0-3 Only**(禁用 P0-1 pushdown) | +| 惰性字符串列 + 无/弱过滤(存活率 ≥ 95%) | 原始路径(禁用 P0-3) | +| 谓词定长列(INT/FLOAT/DOUBLE) | P0-1(Filter Bitmap Pushdown) | +| 谓词字符串列 | 已有 dict filter 机制处理 | + +### 架构意义 + +P0-3 揭示了一个重要设计原则:**对于字符串列,改变数据类型(string → int32)比改变解码粒度(全量 → per-run)更有效**。P0-1 的 per-run SkipBatch 在定长类型(INT32/INT64/FLOAT/DOUBLE)上有效,但在变长类型(字符串)上因 per-run 开销而退化。P0-3 通过将变长问题转化为定长问题(int32),完美规避了这一瓶颈。 + +### 生产安全 + +- 运行时可调配置 `enable_parquet_lazy_dict_decode_for_lazy_columns`,可随时关闭回退原始路径 +- 每个 row group 独立确认字典编码状态,非字典编码列自动回退 +- 与已有的 `_dict_filter_cols` 机制共享类型替换和转换逻辑,代码复用度高 +- 建议后续增加 `filter_ratio` 门控(存活率 > 95% 时禁用),消除 100% 选择率退化 diff --git a/docs/P1_Decoder_Optimizations_Test_Report.md b/docs/P1_Decoder_Optimizations_Test_Report.md new file mode 100644 index 00000000000000..224f7e6122580c --- /dev/null +++ b/docs/P1_Decoder_Optimizations_Test_Report.md @@ -0,0 +1,340 @@ +# P1 解码器优化测试报告 + +## 测试环境 + +| 项目 | 值 | +|------|-------| +| CPU | 16 核 × 3.44 GHz (Intel 第 12 代, Alder Lake) | +| L1 数据缓存 | 48 KiB × 8 | +| L1 指令缓存 | 32 KiB × 8 | +| L2 缓存 | 1280 KiB × 8 | +| L3 缓存 | 49152 KiB (共享) | +| 构建模式 | Release (-O3 -DNDEBUG) | +| 编译器 | Clang (ldb_toolchain) | +| SIMD | AVX2 已启用 (-mavx2) | +| 基准测试框架 | Google Benchmark, 5 次重复, 仅输出聚合结果 | + +## 测试方法 + +### 独立测试组(解耦配置) + +P1-4 (SIMD) 和 P1-5 (Prefetch) 由**独立**的配置开关控制,支持单独和组合评估: + +| 测试组 | SIMD (`enable_parquet_simd_dict_decode`) | Prefetch (`enable_parquet_dict_prefetch`) | 说明 | +|--------|:---:|:---:|-------------| +| **A(基线)** | 关 | 关 | 纯标量循环 — 无优化 | +| **B(仅 P1-4)** | 开 | 关 | AVX2 SIMD gather,无软件预取 | +| **C(仅 P1-5)** | 关 | 开 | 标量循环 + 大字典软件预取 | +| **D(P1-4+P1-5)** | 开 | 开 | SIMD gather + 软件预取组合 | +| **E(P1-6)** | 不适用 | 不适用 | Plain 编码 memcpy 快速路径(独立) | + +每个测试组合使用 `ConfigGuard` RAII 来设置/恢复配置,确保测试间完全隔离。 + +所有测量使用 5 次重复的**中位数 CPU 时间**以减少噪声。 + +### 参数 + +- **字典解码器(A-D 组)**:dict_size ∈ {100, 10K, 1M} × rows ∈ {100K, 500K} + - dict=100:可放入 L1 缓存(INT32 占 400B,INT64 占 800B) + - dict=10K:可放入 L2 缓存(INT32 占 40KB,INT64 占 80KB) + - dict=1M:超出 L2 缓存(INT32 占 4MB,INT64 占 8MB) +- **Plain 解码器(E 组)**:type_length ∈ {4, 8} × rows ∈ {100K, 500K, 1M} + +--- + +## P1-4:AVX2 SIMD 字典 Gather(B 组 vs A 组) + +### 描述 + +将标量字典查找循环替换为 AVX2 SIMD gather 指令: +- **INT32/FLOAT**:`_mm256_i32gather_epi32` — 每条指令处理 8 个值 +- **INT64/DOUBLE**:`_mm256_i32gather_epi64` — 每条指令处理 4 个值 +- **String**:将每次 run 的 `vector` 堆分配替换为可复用的类成员 `_string_values_buf` + +### INT32 结果(B vs A) + +| 字典大小 | 行数 | A:基线 (us) | B:SIMD (us) | 差值 | 加速比 | +|-----------|------|----------------:|------------:|------:|--------:| +| 100 | 100K | 129 | 204 | +75 | **-58.1%** | +| 100 | 500K | 692 | 1093 | +401 | **-57.9%** | +| 10,000 | 100K | 148 | 210 | +62 | **-41.9%** | +| 10,000 | 500K | 767 | 1101 | +334 | **-43.5%** | +| 1,000,000 | 100K | 989 | 1316 | +327 | **-33.1%** | +| 1,000,000 | 500K | 4363 | 4165 | -198 | **+4.5%** | + +### INT64 结果(B vs A) + +| 字典大小 | 行数 | A:基线 (us) | B:SIMD (us) | 差值 | 加速比 | +|-----------|------|----------------:|------------:|------:|--------:| +| 100 | 100K | 123 | 342 | +219 | **-178%** | +| 100 | 500K | 673 | 1721 | +1048 | **-156%** | +| 10,000 | 100K | 143 | 355 | +212 | 
**-148%** | +| 10,000 | 500K | 759 | 1783 | +1024 | **-135%** | +| 1,000,000 | 100K | 1900 | 2426 | +526 | **-27.7%** | +| 1,000,000 | 500K | 3796 | 5494 | +1698 | **-44.7%** | + +### String 结果(B vs A) + +| 字典大小 | 行数 | A:基线 (us) | B:SIMD (us) | 差值 | 加速比 | +|-----------|------|----------------:|------------:|------:|--------:| +| 100 | 100K | 610 | 661 | +51 | **-8.4%** | +| 100 | 500K | 15620 | 15810 | +190 | **-1.2%** | +| 10,000 | 100K | 747 | 812 | +65 | **-8.7%** | +| 10,000 | 500K | 16123 | 17175 | +1052 | **-6.5%** | +| 1,000,000 | 100K | 10089 | 11340 | +1251 | **-12.4%** | +| 1,000,000 | 500K | 38985 | 43263 | +4278 | **-11.0%** | + +### P1-4 分析 + +**INT32 SIMD Gather 全面更慢:** +- 对于 L1/L2 缓存可容纳的字典(100-10K 条目),SIMD 比标量**慢 42-58%**。`_mm256_i32gather_epi32` 指令在 Alder Lake 上有 ~12 周期延迟。对于缓存命中的数据,标量循环凭借良好的分支预测和乱序执行,每次迭代仅需 ~1-2 个周期,远优于批量 gather。 +- 仅在 dict=1M/500K 行时,SIMD 才显示出 **+4.5%** 的微弱提升,此时缓存未命中占主导,gather 指令的内部预取机制部分起作用。 + +**INT64 SIMD Gather 表现极差:** +- `_mm256_i32gather_epi64` 每条指令仅处理 4 个值(INT32 吞吐量的一半),延迟相似。结果是小/中型字典时**慢 135-178%**,即使 dict=1M 也仍然**慢 28-45%**。 +- 该指令从根本上不适合 Alder Lake 上的此类工作负载。 + +**String Buffer 复用略有负面影响:** +- 使用 `resize()` + 赋值的可复用 `_string_values_buf` 路径在所有字典大小上均**慢 1-12%**。原始的 `reserve() + emplace_back()` 模式已被编译器充分优化。 + +**结论:P1-4 SIMD 应默认禁用。** `enable_parquet_simd_dict_decode` 配置应默认为 `false`。 + +--- + +## P1-5:缓存感知字典预取(C 组 vs A 组) + +### 描述 + +当字典大小超过 L2 缓存(~256KB)时,启用软件预取(`__builtin_prefetch`)以隐藏标量字典查找中的缓存未命中延迟。当字典较小(可放入 L1/L2)时,通过 `_dict_exceeds_l2_cache` 标志自动跳过预取 — 因此 A 组和 C 组在小字典时应表现一致。 + +### INT32 结果(C vs A) + +| 字典大小 | 行数 | A:基线 (us) | C:预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|-----------------:|------:|--------:| +| 100 | 100K | 129 | 129 | 0 | 0%(正确,未发出预取) | +| 100 | 500K | 692 | 696 | +4 | -0.6%(噪声) | +| 10,000 | 100K | 148 | 147 | -1 | +0.7%(噪声) | +| 10,000 | 500K | 767 | 769 | +2 | -0.3%(噪声) | +| 1,000,000 | 100K | 989 | 1150 | +161 | **-16.3%** | +| 1,000,000 | 500K | 4363 | 4817 | +454 | **-10.4%** | + +### INT64 结果(C vs A) + +| 字典大小 | 行数 | A:基线 (us) | C:预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|-----------------:|------:|--------:| +| 100 | 100K | 123 | 116 | -7 | +5.7%(噪声/方差) | +| 100 | 500K | 673 | 652 | -21 | +3.1%(噪声) | +| 10,000 | 100K | 143 | 147 | +4 | -2.8%(噪声) | +| 10,000 | 500K | 759 | 780 | +21 | -2.8% | +| 1,000,000 | 100K | 1900 | 2250 | +350 | **-18.4%** | +| 1,000,000 | 500K | 3796 | 4838 | +1042 | **-27.4%** | + +### String 结果(C vs A) + +| 字典大小 | 行数 | A:基线 (us) | C:预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|-----------------:|------:|--------:| +| 100 | 100K | 610 | 624 | +14 | -2.3%(噪声) | +| 100 | 500K | 15620 | 15730 | +110 | -0.7%(噪声) | +| 10,000 | 100K | 747 | 762 | +15 | -2.0%(噪声) | +| 10,000 | 500K | 16123 | 16371 | +248 | -1.5% | +| 1,000,000 | 100K | 10089 | 9772 | -317 | **+3.1%** | +| 1,000,000 | 500K | 38985 | 39994 | +1009 | **-2.6%** | + +### P1-5 分析 + +**小/中型字典(100-10K):** 结果在噪声范围内(±3%),确认 `_dict_exceeds_l2_cache` 标志正确阻止了对缓存可容纳字典的预取。 + +**大型字典(1M 条目)— INT32/INT64:** 软件预取比无预取**慢 10-27%**。这与预期相反。根本原因: +1. 标量循环的访问模式(顺序索引数组,随机字典访问)已经触发了 Alder Lake L2 预取器的硬件预取。 +2. 距离为 8 的软件 `__builtin_prefetch` 与硬件预取竞争,导致**缓存行抖动** — 驱逐有用的字典条目,换入可能不需要的条目。 +3. 
对于真正的随机访问模式(dict=1M),工作集太大,预取无法帮助 — 下一次访问不太可能在上一次附近,使预取预测无效。 + +**大型字典(1M 条目)— String:** 结果不一:100K 行时 +3.1%,500K 行时 -2.6%。StringRef 的间接访问(16 字节指针+长度 → 实际字符串体)创建了两级访问模式,有时可从预取中受益。 + +**结论:P1-5 预取应默认禁用。** `enable_parquet_dict_prefetch` 配置应默认为 `false`。在拥有良好硬件预取器的现代 CPU 上,软件预取对此类工作负载适得其反。 + +--- + +## P1-4+P1-5 组合(D 组 vs A 组) + +### 描述 + +同时启用 SIMD gather 和软件预取。 + +### INT32 结果(D vs A) + +| 字典大小 | 行数 | A:基线 (us) | D:SIMD+预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|----------------:|------:|--------:| +| 100 | 100K | 129 | 208 | +79 | **-61.2%** | +| 100 | 500K | 692 | 1082 | +390 | **-56.4%** | +| 10,000 | 100K | 148 | 207 | +59 | **-39.9%** | +| 10,000 | 500K | 767 | 1074 | +307 | **-40.0%** | +| 1,000,000 | 100K | 989 | 1162 | +173 | **-17.5%** | +| 1,000,000 | 500K | 4363 | 4166 | -197 | **+4.5%** | + +### INT64 结果(D vs A) + +| 字典大小 | 行数 | A:基线 (us) | D:SIMD+预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|----------------:|------:|--------:| +| 100 | 100K | 123 | 340 | +217 | **-176%** | +| 100 | 500K | 673 | 1776 | +1103 | **-164%** | +| 10,000 | 100K | 143 | 355 | +212 | **-148%** | +| 10,000 | 500K | 759 | 1876 | +1117 | **-147%** | +| 1,000,000 | 100K | 1900 | 2450 | +550 | **-28.9%** | +| 1,000,000 | 500K | 3796 | 5672 | +1876 | **-49.4%** | + +### String 结果(D vs A) + +| 字典大小 | 行数 | A:基线 (us) | D:SIMD+预取 (us) | 差值 | 加速比 | +|-----------|------|----------------:|----------------:|------:|--------:| +| 100 | 100K | 610 | 668 | +58 | **-9.5%** | +| 100 | 500K | 15620 | 15391 | -229 | +1.5% | +| 10,000 | 100K | 747 | 847 | +100 | **-13.4%** | +| 10,000 | 500K | 16123 | 16595 | +472 | **-2.9%** | +| 1,000,000 | 100K | 10089 | 10374 | +285 | **-2.8%** | +| 1,000,000 | 500K | 38985 | 40938 | +1953 | **-5.0%** | + +### 组合分析 + +组合路径(D)的表现与仅 SIMD(B)基本一致,因为 SIMD gather 路径占主导。在 SIMD 之上添加预取几乎没有额外益处或损害,原因是: +1. 对于小/中型字典,SIMD 已是瓶颈(gather 延迟),且预取未被发出。 +2. 
对于发出预取的大型字典,SIMD gather 已进行自身的内部预取,使软件预取变得多余。 + +**组合路径在所有 INT32/INT64 场景中均未超过基线,仅 dict=1M/500K INT32 例外(+4.5%)。** + +--- + +## P1-6:Plain 编码 memcpy 快速路径(E 组) + +### 描述 + +在 `FixLengthPlainDecoder::_decode_values()`(无过滤路径)中添加短路逻辑:当零空值时,整个批次是一个连续的 CONTENT run,因此用**单次 `memcpy`** 替代 run 循环迭代。这在数学上等价 — 零行为变化,无需配置开关。 + +### 结果 + +| 类型 | 行数 | 快速路径 (us) | 含空值 (us) | 加速比 | 倍率 | +|------|------|---------------:|---------------:|--------:|------:| +| INT32 (4B) | 100K | **13.5** | 46.2 | **+71%** | **3.4x** | +| INT32 (4B) | 500K | **139** | 374 | **+63%** | **2.7x** | +| INT32 (4B) | 1M | **284** | 792 | **+64%** | **2.8x** | +| INT64 (8B) | 100K | **49.9** | 68.3 | **+27%** | **1.4x** | +| INT64 (8B) | 500K | **282** | 471 | **+40%** | **1.7x** | +| INT64 (8B) | 1M | **3911** | 4865 | **+20%** | **1.2x** | + +### P1-6 分析 + +memcpy 快速路径带来了**卓越且一致的性能提升**: + +**INT32 (4B):** 2.7x-3.4x 加速。`get_next_run()` + switch 语句 + 逐 run memcpy 的循环开销被完全消除。单次 `memcpy` 整个批次达到接近最大内存带宽: +- 100K 行 × 4B = 400KB → 13.5us → **29.6 GB/s** 有效吞吐量 +- 1M 行 × 4B = 4MB → 284us → **14.1 GB/s**(此规模受 L3/内存带宽限制) + +**INT64 (8B):** 1.2x-1.7x 加速。提升幅度较小,原因是: +- 更大的元素尺寸意味着瓶颈更快从指令开销转移到内存带宽 +- 在 1M 行 × 8B = 8MB 时,两条路径都基本受内存带宽限制,差距缩小 +- 仍有 20-40% 的显著提升 + +**适用场景:** 此优化在 `num_nulls == 0` 且 `has_filter == false` 时生效,这是以下常见场景: +- 非空列(分区键、主键、许多聚合目标列) +- 未涉及下推谓词的列 +- 典型数据湖工作负载中大多数 Plain 编码的 Parquet 列 + +**无需配置开关:** 这是纯算法优化,零行为变化,性能提升普遍为正。 + +--- + +## 总体概览 + +### 加速热力图(中位数,vs 基线) + +正值 = 更快,负值 = 更慢。加粗 = 统计显著。 + +| 优化项 | INT32 小字典 | INT32 大字典 | INT64 小字典 | INT64 大字典 | String 小字典 | String 大字典 | +|-------------|:---:|:---:|:---:|:---:|:---:|:---:| +| B: P1-4 SIMD | **-58%** | +4.5% | **-178%** | **-45%** | **-8%** | **-11%** | +| C: P1-5 预取 | 0% | **-10%** | 0% | **-27%** | 0% | +3%/-3% | +| D: P1-4+P1-5 | **-61%** | +4.5% | **-176%** | **-49%** | **-10%** | **-5%** | +| E: P1-6 快速路径 | 不适用 | 不适用 | 不适用 | 不适用 | 不适用 | 不适用 | + +| 优化项 | INT32 100K | INT32 500K | INT32 1M | INT64 100K | INT64 500K | INT64 1M | +|-------------|:---:|:---:|:---:|:---:|:---:|:---:| +| E: P1-6 快速路径 | **+71% (3.4x)** | **+63% (2.7x)** | **+64% (2.8x)** | **+27% (1.4x)** | **+40% (1.7x)** | **+20% (1.2x)** | + +### 建议 + +| 优先级 | 优化项 | 默认配置 | 理由 | +|:--------:|-------------|:--------------:|-----------| +| **1** | **P1-6 Plain 快速路径** | 始终开启(无开关) | INT32 **2.7x-3.4x** 加速,INT64 **1.2x-1.7x** 加速。零风险,纯算法等价。P1 系列中影响最大的单项优化。 | +| **2** | P1-4 SIMD Gather | **`false`**(禁用) | SIMD gather 在绝大多数实际工作负载(小/中型字典)中**慢 42-178%**。仅在 dict=1M INT32 时有微弱收益。AVX2 gather 指令与 Alder Lake 上的此访问模式从根本上不匹配。 | +| **3** | P1-5 软件预取 | **`false`**(禁用) | INT32/INT64 大字典上软件预取**慢 10-27%**。现代 CPU 硬件预取器已能处理该访问模式。软件预取与硬件预取竞争,导致缓存污染。 | + +### 根因分析:为什么 SIMD Gather 失败 + +`_mm256_i32gather_epi32` 指令在理论上很有吸引力(每条指令 8 个值),但在此场景中表现不佳,原因是: + +1. **Gather 延迟 vs 标量指令级并行**:gather 指令在 Alder Lake 上有 ~12 周期延迟,并将所有 8 次内存访问串行化。标量循环通过乱序执行和指令级并行,每个值仅需 ~1-2 个周期。 + +2. **缓存行为不匹配**:对于 L1 驻留的字典(dict=100-10K),标量加载在 ~4 个周期内命中 L1,且 CPU 可同时流水线化多个加载。gather 指令无法利用此并行性 — 它必须等待所有 8 个地址后才能发出任何加载。 + +3. **INT64 尤其糟糕**:`_mm256_i32gather_epi64` 每条指令仅处理 4 个值(INT32 吞吐量的一半),延迟开销相似。吞吐量优势完全消失。 + +4. **编译器自动向量化**:使用 `-O3` 的标量循环可能已经受益于编译器自动向量化优化,这些优化比手动 AVX2 intrinsics 更适合特定的访问模式。 + +### 根因分析:为什么软件预取失败 + +1. **硬件预取器竞争**:Intel Alder Lake 拥有复杂的 L2 步长和流预取器。添加软件 `__builtin_prefetch` 会创建冲突的预取流。 + +2. **随机访问模式**:对于 dict=1M 的随机索引访问,下一个字典条目地址基本不可预测。当下一个需要的条目可能在 4MB 字典的任何位置时,预取 `i+8` 处的条目是对缓存带宽的浪费。 + +3. 
**缓存污染**:每次软件预取将一个 64 字节缓存行带入 L1/L2。在对 4MB 字典的随机访问中,这些预取的缓存行很可能在使用前就被驱逐,取代了有用的数据。 + +--- + +## 配置变更建议 + +```cpp +// be/src/common/config.cpp — 建议的变更: +DEFINE_mBool(enable_parquet_simd_dict_decode, "false"); // 从 "true" 改为 "false" +DEFINE_mBool(enable_parquet_dict_prefetch, "false"); // 从 "true" 改为 "false" +``` + +两个配置保留为 `mBool`(运行时可修改),以便用户在不同硬件上实验。在 gather 吞吐量更好的 CPU 上(如 AMD Zen 4、Intel Sapphire Rapids),SIMD gather 的表现可能不同。 + +--- + +## 测试执行详情 + +**日期**:2026-02-11 + +**基准测试二进制文件**:`be/build_benchmark/bin/benchmark_test` + +**命令**: +```bash +export JAVA_HOME=/mnt/disk2/chenqi/jdk-17.0.8 +export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" +export DORIS_HOME=/mnt/disk2/chenqi/doris-master3/be/build_benchmark +cd /mnt/disk2/chenqi/doris-master3/be/build_benchmark + +# INT32 A/B/C/D 组 +./bin/benchmark_test --benchmark_filter="BM_Group[ABCD]_INT32" \ + --benchmark_repetitions=5 --benchmark_report_aggregates_only=true + +# INT64 A/B/C/D 组 +./bin/benchmark_test --benchmark_filter="BM_Group[ABCD]_INT64" \ + --benchmark_repetitions=5 --benchmark_report_aggregates_only=true + +# String A/B/C/D 组 +./bin/benchmark_test --benchmark_filter="BM_Group[ABCD]_String" \ + --benchmark_repetitions=5 --benchmark_report_aggregates_only=true + +# P1-6 Plain 快速路径(E 组) +./bin/benchmark_test --benchmark_filter="BM_GroupE" \ + --benchmark_repetitions=5 --benchmark_report_aggregates_only=true +``` + +**修改的文件**: +- `be/benchmark/benchmark_p1_decoder_opts.hpp` — 包含 A/B/C/D/E 组的基准测试文件 +- `be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp` — SIMD gather + 预取 + 三路分支 +- `be/src/vec/exec/format/parquet/byte_array_dict_decoder.h/.cpp` — String 复用 + 预取 + 三路分支 +- `be/src/vec/exec/format/parquet/fix_length_plain_decoder.h` — memcpy 快速路径 +- `be/src/common/config.h/.cpp` — `enable_parquet_simd_dict_decode`、`enable_parquet_dict_prefetch` 配置 diff --git a/docs/Parquet_Reader_P0_Optimization_Implementation_Plan.md b/docs/Parquet_Reader_P0_Optimization_Implementation_Plan.md new file mode 100644 index 00000000000000..90f9f4da5e9793 --- /dev/null +++ b/docs/Parquet_Reader_P0_Optimization_Implementation_Plan.md @@ -0,0 +1,1258 @@ +# Doris Parquet Reader P0 优化方向详细实现方案 + +> 基于 Doris 现有代码结构和 StarRocks 参考实现,给出三个 P0 优化方向的详细实现方案。 + +--- + +## P0-1:Filter Bitmap 下推到 Decoder 层 + +### 1.1 问题分析 + +#### 当前数据流 + +``` +ScalarColumnReader::read_column_data(filter_map) + → _read_values(filter_map) + → ColumnSelectVector::init(null_map, filter_map) // 合并 null + filter 为 4 种 run + → ColumnChunkReader::decode_values(select_vector) + → Decoder::decode_values(doris_column, data_type, select_vector, is_dict_filter) +``` + +#### 浪费点 1:字典 Index 全量解码 + +**文件**: `be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp:97-98` + +```cpp +// _decode_values() 中: +size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); +_indexes.resize(non_null_size); +_index_batch_decoder->GetBatch(_indexes.data(), non_null_size); // 解码 ALL 非空 index +``` + +所有非空行的 RLE dict index 被全部解码,包括那些将被 `FILTERED_CONTENT` 跳过的行。在低选择率场景(如 5% 存活),95% 的 index 解码是浪费的。 + +**同样存在于**: `be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp:116-117` + +#### 浪费点 2:BaseDictDecoder::skip_values() 的无效解码 + +**文件**: `be/src/vec/exec/format/parquet/decoder.h:149-153` + +```cpp +Status skip_values(size_t num_values) override { + _indexes.resize(num_values); + _index_batch_decoder->GetBatch(_indexes.data(), num_values); // 解码后丢弃 + return Status::OK(); +} +``` + +跳过值时仍需完整解码 RLE index 到内存,然后丢弃。缺少 `RleBatchDecoder::Skip()` 方法。 + +#### 浪费点 
3:FILTERED_CONTENT 行的字典值 Lookup + +在 `_decode_fixed_values()` 中 (`fix_length_dict_decoder.hpp:166-168`): +```cpp +case ColumnSelectVector::FILTERED_CONTENT: { + dict_index += run_length; // 跳过 index,但 index 已经在上面被解码了 + break; +} +``` + +虽然 FILTERED_CONTENT 不做 dict lookup(只是 `dict_index += run_length`),但这些 index 已经在步骤 1 被解码出来了。 + +### 1.2 实现方案 + +#### 方案概述 + +**不修改 ColumnSelectVector 机制**,在 Decoder 内部接收原始 filter bitmap,当选择率 < 阈值时,用 filter bitmap 跳过无用的字典值 lookup(对于大字典尤其有效,减少 cache miss)。 + +#### 步骤 1:为 RleBatchDecoder 添加 SkipBatch 方法 + +**文件**: `be/src/util/rle_encoding.h` + +```cpp +template +class RleBatchDecoder { +public: + // 已有方法 + int32_t GetBatch(T* values, uint32_t batch_size); + + // 新增:跳过 num_values 个值,不写入任何缓冲区 + int32_t SkipBatch(uint32_t num_values) { + DCHECK_GT(num_values, 0); + int32_t num_skipped = 0; + while (num_skipped < num_values) { + if (UNLIKELY(num_buffered_values_ == 0)) { + if (UNLIKELY(!NextCounts())) return num_skipped; + } + uint32_t to_skip = std::min( + num_values - num_skipped, num_buffered_values_); + if (repeat_count_ > 0) { + // RLE run:直接减少 repeat_count_ + uint32_t skip = std::min(to_skip, repeat_count_); + repeat_count_ -= skip; + num_buffered_values_ -= skip; + num_skipped += skip; + } else { + // Literal run:推进 literal buffer 位置 + uint32_t skip = std::min(to_skip, literal_count_); + for (uint32_t i = 0; i < skip; ++i) { + // 需要从 bit reader 读取并丢弃 + T unused; + if (!bit_reader_.GetValue(bit_width_, &unused)) return num_skipped; + } + literal_count_ -= skip; + num_buffered_values_ -= skip; + num_skipped += skip; + } + } + return num_skipped; + } +}; +``` + +**注意**:检查 Doris 现有的 `RleBatchDecoder` 实现(可能已有类似方法,需确认)。如果 literal run 的跳过无法避免 bit 读取,至少能避免内存分配和写入。 + +#### 步骤 2:修改 BaseDictDecoder::skip_values() + +**文件**: `be/src/vec/exec/format/parquet/decoder.h:149-153` + +```cpp +// 修改前 +Status skip_values(size_t num_values) override { + _indexes.resize(num_values); + _index_batch_decoder->GetBatch(_indexes.data(), num_values); + return Status::OK(); +} + +// 修改后 +Status skip_values(size_t num_values) override { + auto skipped = _index_batch_decoder->SkipBatch(cast_set(num_values)); + if (UNLIKELY(skipped < num_values)) { + return Status::InternalError("RLE skip error, not enough values"); + } + return Status::OK(); +} +``` + +**收益**:消除 skip 场景下的内存分配 (`_indexes.resize`) 和无效写入。 + +#### 步骤 3:修改 Decoder 接口,添加 filter bitmap 参数 + +**文件**: `be/src/vec/exec/format/parquet/decoder.h:69-70` + +```cpp +// 修改前 +virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, bool is_dict_filter) = 0; + +// 修改后:添加可选的 filter bitmap 参数 +virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) = 0; +``` + +#### 步骤 4:在 ColumnChunkReader 中传递 filter bitmap + +**文件**: `be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp:528-544` + +修改 `ColumnChunkReader::decode_values()` 签名,添加 `filter_data` 参数并转发给 decoder: + +```cpp +template +Status ColumnChunkReader::decode_values( + MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { + // ... 现有检查 ... 
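+    // 说明:本层不新增任何判断逻辑,唯一改动是把 filter_data 原样透传给 page decoder;
+    // filter_data 取默认值 nullptr 时,行为与修改前的接口完全一致,各具体 Decoder 可据此渐进式适配。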
+ return _page_decoder->decode_values(doris_column, data_type, select_vector, + is_dict_filter, filter_data); +} +``` + +同步修改头文件 `vparquet_column_chunk_reader.h` 中的声明。 + +#### 步骤 5:在 ScalarColumnReader::_read_values() 中决策并传递 filter bitmap + +**文件**: `be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:390-397` + +```cpp +// 在 ColumnSelectVector::init() 之后,decode_values() 之前: +const uint8_t* filter_data_for_decoder = nullptr; +if (select_vector.has_filter() && filter_map.has_filter()) { + // 计算选择率 + size_t total = select_vector.num_values(); + size_t filtered = select_vector.num_filtered(); + double selectivity = 1.0 - static_cast(filtered) / total; + // 选择率 < 20% 时下推 filter bitmap + if (selectivity < 0.2) { + filter_data_for_decoder = filter_map.filter_map_data() + _filter_map_index - num_values; + } +} +return _chunk_reader->decode_values(data_column, type, select_vector, + is_dict_filter, filter_data_for_decoder); +``` + +#### 步骤 6:修改 FixLengthDictDecoder 使用 filter bitmap + +**文件**: `be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp` + +```cpp +Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data = nullptr) override { + if (select_vector.has_filter()) { + return _decode_values(doris_column, data_type, select_vector, + is_dict_filter, filter_data); + } else { + return _decode_values(doris_column, data_type, select_vector, + is_dict_filter, nullptr); + } +} + +template +Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, bool is_dict_filter, + const uint8_t* filter_data) { + size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); + + // ... dict column 初始化代码不变 ... + + // 仍需全量解码 RLE index(RLE 是顺序解码,无法跳过) + _indexes.resize(non_null_size); + _index_batch_decoder->GetBatch(_indexes.data(), cast_set(non_null_size)); + + if (doris_column->is_column_dictionary() || is_dict_filter) { + return _decode_dict_values(doris_column, select_vector, is_dict_filter); + } + + return _decode_fixed_values(doris_column, data_type, select_vector, filter_data); +} +``` + +修改 `_decode_fixed_values` 在 `CONTENT` 分支中利用 filter bitmap: + +```cpp +template +Status _decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, + const uint8_t* filter_data) { + // ... 现有的 resize 和 raw_data 获取 ... 
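+    // 本循环维护三个游标,推进规则与下方各 run 分支保持一致:
+    //   data_index    —— 输出列中的写偏移,仅在 CONTENT / NULL_DATA run 上推进;
+    //   dict_index    —— _indexes 中的位置,对应所有非空行(CONTENT / FILTERED_CONTENT);
+    //   filter_offset —— filter_data 的下标,与 data_index 在相同的 run 类型上推进,
+    //                    即 filter_data 按"将被物化到输出列的行"对齐。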
+ size_t dict_index = 0; + size_t filter_offset = 0; // 跟踪 filter bitmap 位置 + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + if (filter_data != nullptr) { + // 有 filter bitmap:仅对 filter[i]=1 的行做 dict lookup + for (size_t i = 0; i < run_length; ++i) { + if (filter_data[filter_offset + i]) { + auto& item = _dict_items[_indexes[dict_index]]; + memcpy(raw_data + data_index, &item, _type_length); + } + // 无论是否 filter,都要推进 data_index 和 dict_index + data_index += _type_length; + dict_index++; + } + } else { + // 原有路径不变 + for (size_t i = 0; i < run_length; ++i) { + *(cppType*)(raw_data + data_index) = _dict_items[_indexes[dict_index++]]; + data_index += _type_length; + } + } + filter_offset += run_length; + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length * _type_length; + filter_offset += run_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + dict_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + break; + } + } + } + return Status::OK(); +} +``` + +**核心收益**:在 `CONTENT` run 中,`filter_data[i]=0` 的行跳过 `_dict_items[_indexes[...]]` 的随机内存访问。对于大字典(> L2 cache),这可以显著减少 cache miss。 + +#### 步骤 7:同样修改 ByteArrayDictDecoder + +**文件**: `be/src/vec/exec/format/parquet/byte_array_dict_decoder.h` 和 `.cpp` + +对 `ByteArrayDictDecoder::_decode_values()` 做类似修改:在 `CONTENT` 分支中,仅对 `filter_data[i]=1` 的行执行 `_dict_items[_indexes[dict_index]]` 的 StringRef 构造和 `insert_many_strings_overflow()`。 + +对于 string 类型,收益更大:跳过 filter 的行不仅避免了 dict lookup 的 cache miss,还避免了 string copy。 + +#### 步骤 8:添加配置开关 + +**文件**: `be/src/common/config.h` + +```cpp +CONF_mBool(parquet_push_down_filter_to_decoder_enable, "true"); +``` + +**文件**: `be/src/common/config.cpp` 中注册。 + +在步骤 5 的选择率判断中加入配置检查: + +```cpp +if (selectivity < 0.2 && config::parquet_push_down_filter_to_decoder_enable) { + filter_data_for_decoder = ...; +} +``` + +### 1.3 涉及修改的文件清单 + +| 文件 | 修改内容 | +|------|----------| +| `be/src/util/rle_encoding.h` | 添加 `RleBatchDecoder::SkipBatch()` 方法 | +| `be/src/vec/exec/format/parquet/decoder.h` | 修改 `Decoder::decode_values()` 签名(添加 `filter_data`);修改 `BaseDictDecoder::skip_values()` 使用 SkipBatch | +| `be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h` | 修改 `decode_values()` 签名 | +| `be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp` | 转发 `filter_data` | +| `be/src/vec/exec/format/parquet/vparquet_column_reader.cpp` | 计算选择率,决策是否下推 filter bitmap | +| `be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp` | CONTENT 分支利用 filter bitmap 跳过 dict lookup | +| `be/src/vec/exec/format/parquet/byte_array_dict_decoder.h/.cpp` | 同上 | +| `be/src/vec/exec/format/parquet/fix_length_plain_decoder.h/.cpp` | 签名同步修改(Plain 编码受益较小,可选实现) | +| `be/src/vec/exec/format/parquet/byte_array_plain_decoder.h/.cpp` | 签名同步修改 | +| `be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h` | 签名同步修改 | +| `be/src/vec/exec/format/parquet/byte_stream_split_decoder.h` | 签名同步修改 | +| `be/src/vec/exec/format/parquet/bss_page_decoder.h` | 签名同步修改 | +| `be/src/common/config.h` | 新增 `parquet_push_down_filter_to_decoder_enable` | + +### 1.4 StarRocks 参考 + +- **选择率门控**: `stored_column_reader.h:155-161`,`_convert_filter_row_to_value()` 使用 `SIMD::count_nonzero(*filter) * 1.0 / filter->size() < 0.2` 作为阈值 +- **Cache-Aware 门控**: `encoding_dict.h:122-126`,字典 > L2 cache 时才传 filter(可在后续 P1 优化中实现) +- **Decoder 使用 filter**: `encoding_dict.h:359-363`,`if (filter[i]) { 
data[i] = _dict[_indexes[i]]; }` + +--- + +## P0-2:谓词列读取顺序优化 + +### 2.1 问题分析 + +#### 当前数据流 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:518-725` (`_do_lazy_read()`) + +``` +_do_lazy_read(): + Phase 1: _read_column_data(block, predicate_columns.first, ...) + → 读 ALL 谓词列(一次性,schema 顺序) + → VExprContext::execute_conjuncts(_filter_conjuncts, ...) + → 产出 filter_map + Phase 2: _read_column_data(block, lazy_columns, filter_map) + → 带 filter 读懒加载列 +``` + +**问题**:Phase 1 中所有谓词列**一次性全部读取**,没有中间过滤。假设有 3 个谓词列 A、B、C: +- 列 A 的选择率 5%(过滤掉 95% 的行) +- 列 B、C 的过滤效果较弱 + +当前做法:A、B、C 三列全部解码所有行 → 然后整体过滤。 +优化做法:先读 A → 过滤 → 只对存活行读 B → 过滤 → 只对存活行读 C → ... + +#### 当前谓词列顺序 + +谓词列顺序 = **Parquet 文件 schema 顺序**(`vparquet_reader.cpp:539-558`),与列的选择率无关。 + +### 2.2 实现方案 + +#### 方案概述 + +将 `_do_lazy_read()` 的 Phase 1 从"一次性读所有谓词列"改为"逐列读取+中间过滤",并引入自适应列排序机制选择最优读取顺序。 + +#### 步骤 1:重构谓词 conjuncts 按列分组 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.h` + +在 `RowGroupReader` 中新增成员: + +```cpp +// 按列分组的 conjuncts:slot_id -> conjuncts 列表 +// 仅包含单列谓词(引用单个 slot_id 的 conjunct) +std::unordered_map _single_col_filter_conjuncts; +// 多列谓词(引用多个 slot_id 的 conjunct),在所有涉及列读完后评估 +VExprContextSPtrs _multi_col_filter_conjuncts; +``` + +#### 步骤 2:在 init() 中分类 conjuncts + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` + +在现有的 `_filter_conjuncts` 构建之后,分析每个 conjunct 引用的列: + +```cpp +void RowGroupReader::_classify_conjuncts_by_column() { + for (auto& conjunct : _filter_conjuncts) { + std::set referenced_slot_ids; + _collect_slot_ids(conjunct->root(), referenced_slot_ids); + + if (referenced_slot_ids.size() == 1) { + int slot_id = *referenced_slot_ids.begin(); + _single_col_filter_conjuncts[slot_id].push_back(conjunct); + } else { + _multi_col_filter_conjuncts.push_back(conjunct); + } + } +} + +void RowGroupReader::_collect_slot_ids(VExpr* expr, std::set& slot_ids) { + if (expr->is_slot_ref()) { + slot_ids.insert(static_cast(expr)->slot_id()); + } + for (auto& child : expr->children()) { + _collect_slot_ids(child.get(), slot_ids); + } +} +``` + +#### 步骤 3:引入 ColumnReadOrderCtx 类 + +**新建文件**: `be/src/vec/exec/format/parquet/column_read_order_ctx.h` + +```cpp +#pragma once + +#include +#include +#include +#include + +namespace doris::vectorized { + +class ColumnReadOrderCtx { +public: + ColumnReadOrderCtx(std::vector col_slot_ids, + std::unordered_map col_cost_map, + size_t total_cost) + : _best_order(std::move(col_slot_ids)), + _col_cost_map(std::move(col_cost_map)), + _min_round_cost(total_cost) {} + + // 获取当前轮次的列读取顺序 + // 前 EXPLORATION_ROUNDS 轮返回随机顺序;之后返回最优顺序 + const std::vector& get_column_read_order() { + if (_exploration_remaining > 0) { + _trying_order = _best_order; + std::shuffle(_trying_order.begin(), _trying_order.end(), + std::mt19937(std::random_device()())); + return _trying_order; + } + return _best_order; + } + + // 每轮结束后更新统计:round_cost = 实际读取的数据量 + // first_selectivity = 第一列过滤后的存活比例 + void update(size_t round_cost, double first_selectivity) { + if (_exploration_remaining > 0) { + if (round_cost < _min_round_cost || + (round_cost == _min_round_cost && + first_selectivity > 0 && first_selectivity < _best_first_selectivity)) { + _best_order = _trying_order; + _min_round_cost = round_cost; + _best_first_selectivity = first_selectivity; + } + _trying_order.clear(); + _exploration_remaining--; + } + } + + size_t get_column_cost(int slot_id) const { + auto it = _col_cost_map.find(slot_id); + return it != _col_cost_map.end() ? 
it->second : 0; + } + +private: + static constexpr int EXPLORATION_ROUNDS = 10; + + std::vector _best_order; // 已知最优顺序 + std::vector _trying_order; // 当前尝试的顺序 + std::unordered_map _col_cost_map; // slot_id -> 平面大小 cost + size_t _min_round_cost; + double _best_first_selectivity = 1.0; + int _exploration_remaining = EXPLORATION_ROUNDS; +}; + +} // namespace doris::vectorized +``` + +#### 步骤 4:在 RowGroupReader 中初始化 ColumnReadOrderCtx + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` + +在 `init()` 最后,如果有谓词列且启用了列顺序优化: + +```cpp +if (_lazy_read_ctx.can_lazy_read && + _lazy_read_ctx.predicate_columns.first.size() > 1) { + // 只有多于 1 个谓词列时才需要排序优化 + std::vector pred_slot_ids = _lazy_read_ctx.predicate_columns.second; + std::unordered_map cost_map; + size_t total_cost = 0; + for (size_t i = 0; i < pred_slot_ids.size(); ++i) { + const auto& col_name = _lazy_read_ctx.predicate_columns.first[i]; + // cost 使用列的物理类型大小作为近似 + size_t col_cost = _column_readers[col_name]->get_type_length(); + cost_map[pred_slot_ids[i]] = col_cost; + total_cost += col_cost; + } + _column_read_order_ctx = std::make_unique( + pred_slot_ids, std::move(cost_map), total_cost); +} +``` + +#### 步骤 5:重构 _do_lazy_read() Phase 1 为逐列读取 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:518-725` + +将 Phase 1 的 `_read_column_data(block, predicate_columns.first, ...)` 替换为逐列读取 + 中间过滤: + +```cpp +Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, + size_t* read_rows, bool* batch_eof) { + // ... 现有的初始化代码 ... + + while (!_state->is_cancelled()) { + // Phase 1: 逐列读取谓词列 + FilterMap filter_map; // 初始为空 + + if (_column_read_order_ctx) { + // === 新路径:逐列读取 + 中间过滤 === + const auto& read_order = _column_read_order_ctx->get_column_read_order(); + size_t round_cost = 0; + double first_selectivity = -1; + bool all_filtered = false; + + for (size_t round = 0; round < read_order.size(); ++round) { + int slot_id = read_order[round]; + // 找到对应的列名 + std::string col_name = _find_col_name_by_slot_id(slot_id); + + round_cost += _column_read_order_ctx->get_column_cost(slot_id); + + // 读取单列(带 filter_map,如果有的话) + _read_single_column_data(block, col_name, batch_size, + &pre_read_rows, &pre_eof, filter_map); + + // 如果该列有单列谓词,执行过滤 + auto it = _single_col_filter_conjuncts.find(slot_id); + if (it != _single_col_filter_conjuncts.end()) { + IColumn::Filter result_filter; + bool can_filter_all = false; + VExprContext::execute_conjuncts(it->second, nullptr, + block, &result_filter, &can_filter_all); + + if (can_filter_all) { + all_filtered = true; + if (first_selectivity < 0) first_selectivity = 0; + break; // 所有行被过滤,提前退出 + } + + // 更新 filter_map + _update_filter_map_with_result(filter_map, result_filter); + + if (first_selectivity < 0) { + size_t hit = simd::count_nonzero(result_filter.data(), + result_filter.size()); + first_selectivity = static_cast(hit) / result_filter.size(); + } + } + } + + // 执行多列谓词(所有列都已读取) + if (!all_filtered && !_multi_col_filter_conjuncts.empty()) { + // ... 执行 _multi_col_filter_conjuncts ... + } + + _column_read_order_ctx->update(round_cost, first_selectivity); + + } else { + // === 原有路径:一次性读取所有谓词列 === + _read_column_data(block, _lazy_read_ctx.predicate_columns.first, + batch_size, &pre_read_rows, &pre_eof, filter_map); + // ... 原有的 conjunct 执行和 filter_map 构建 ... + } + + // ... Phase 2: 读取 lazy 列(不变) ... + // ... 后续逻辑不变 ... 
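+        // 注:逐列路径最终累积出的 filter_map 与原有路径产出的 filter_map 语义一致,
+        // 因此 Phase 2 读取惰性列以及之后的过滤 / 转换逻辑均无需改动。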
+ } +} +``` + +#### 步骤 6:添加辅助方法 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.h` 和 `.cpp` + +```cpp +// 按 slot_id 找列名 +std::string RowGroupReader::_find_col_name_by_slot_id(int slot_id) { + const auto& names = _lazy_read_ctx.predicate_columns.first; + const auto& ids = _lazy_read_ctx.predicate_columns.second; + for (size_t i = 0; i < ids.size(); ++i) { + if (ids[i] == slot_id) return names[i]; + } + return ""; +} + +// 读取单个列 +Status RowGroupReader::_read_single_column_data( + Block* block, const std::string& col_name, + size_t batch_size, size_t* read_rows, bool* eof, + FilterMap& filter_map) { + // 与 _read_column_data 类似,但只读一列 + // 包括 dict filter column 的类型替换逻辑 + std::vector single_col = {col_name}; + return _read_column_data(block, single_col, batch_size, read_rows, eof, filter_map); +} + +// 合并新的 filter 结果到已有的 filter_map +void RowGroupReader::_update_filter_map_with_result( + FilterMap& filter_map, const IColumn::Filter& new_filter) { + if (!filter_map.has_filter()) { + // 首次过滤:直接使用 new_filter + _filter_map_data = new_filter; // 成员变量存储 + filter_map.init(_filter_map_data.data(), _filter_map_data.size(), false); + } else { + // 后续过滤:AND 合并 + const uint8_t* existing = filter_map.filter_map_data(); + for (size_t i = 0; i < new_filter.size(); ++i) { + _filter_map_data[i] &= new_filter[i]; + } + bool all_zero = simd::count_zero_num(_filter_map_data.data(), + _filter_map_data.size()) + == _filter_map_data.size(); + filter_map.init(_filter_map_data.data(), _filter_map_data.size(), all_zero); + } +} +``` + +### 2.3 涉及修改的文件清单 + +| 文件 | 修改内容 | +|------|----------| +| **新建** `be/src/vec/exec/format/parquet/column_read_order_ctx.h` | ColumnReadOrderCtx 类定义 | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.h` | 新增成员:`_column_read_order_ctx`、`_single_col_filter_conjuncts`、`_multi_col_filter_conjuncts`、`_filter_map_data` | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` | 重构 `_do_lazy_read()` Phase 1;新增 `_classify_conjuncts_by_column()`、`_read_single_column_data()`、`_update_filter_map_with_result()` | + +### 2.4 StarRocks 参考 + +- **ColumnReadOrderCtx**: `column_read_order_ctx.h:24-54`,10 次随机搜索 + cost-based 选择 +- **逐列读取**: `group_reader.cpp:272-335` `_read_range_round_by_round()`,每列读完后执行 dict filter 和 non-dict conjuncts +- **提前退出**: `hit_count == 0` 时立即返回,跳过后续列 +- **Cost 度量**: 使用 `slot_type().get_flat_size()` 作为列 cost + +### 2.5 注意事项 + +1. **dict filter 列的处理**:逐列读取时,dict filter 列的类型替换(String → Int32)和 dict conjunct 评估需要在对应列读取后立即执行,而非等所有列读完。 + +2. **谓词的列归属**:有些 conjunct 可能引用多个列(如 `WHERE a + b > 10`),这些无法在单列读完后评估,需要延迟到所有涉及列读完后执行。 + +3. **探索期性能**:前 10 个 batch 使用随机顺序,可能不是最优。但由于每个 batch 通常有数千行,10 个 batch 的探索开销可以接受。 + +4. **只对 lazy read 路径有效**:非 lazy read 路径(所有列同时读取)不适用此优化。但 lazy read 是最常见的分析查询模式。 + +--- + +## P0-3:Lazy Dictionary Decode + +### 3.1 问题分析 + +#### 当前字典过滤流程 + +Doris 已有一套字典过滤机制,但与 StarRocks 的 Lazy Dict Decode 有本质区别: + +**Doris 现有 Dict Filter 流程** (`vparquet_group_reader.cpp:1042-1266`): + +``` +1. init() 时: + _rewrite_dict_predicates() + → 读取字典页所有值到 ColumnString + → 在字典值上执行 conjuncts + → 收集存活的 dict codes + → 将 string 谓词改写为 int32 IN/EQ 谓词 + +2. 读取时 (_read_column_data): + → 列类型替换:DataTypeString → DataTypeInt32 + → ByteArrayDictDecoder 输出 int32 dict codes(而非 string) + → 执行改写后的 int32 谓词 + +3. 
过滤后 (_convert_dict_cols_to_string_cols): + → ColumnInt32 → 查字典 → ColumnString +``` + +**局限性**: +- 只对**有 IN/EQ 谓词的 string 列**有效(`_can_filter_by_dict()` 严格限制) +- 不是 "Lazy Decode",而是 "Predicate Rewrite" — 谓词改写为 dict code 上的操作 +- 对于**没有谓词但属于懒加载列的 string 列**,无法利用字典编码的优势 + +#### StarRocks 的 Lazy Dict Decode 范围更广 + +StarRocks 的 Lazy Dict Decode 不仅用于有谓词的列,还用于**所有 lazy 列的 string 类型字典编码列**。核心思想是: + +1. 先只读 dict codes (int32) — 非常便宜 +2. 等 active 列过滤后,只对存活行做 dict code → string 的转换 +3. 如果 95% 的行被过滤,就只需转换 5% 的行 + +### 3.2 实现方案 + +#### 方案概述 + +扩展 Doris 现有的 dict filter 机制,使其覆盖到所有 lazy 列中的 string 类型字典编码列,即使这些列没有谓词。 + +#### 步骤 1:引入 ColumnContentType 枚举 + +**新建文件**: `be/src/vec/exec/format/parquet/parquet_utils.h`(或添加到 `parquet_common.h`) + +```cpp +enum class ColumnContentType : uint8_t { + VALUE = 0, // 解码为实际值(string、int 等) + DICT_CODE = 1 // 仅输出 dict codes (int32) +}; +``` + +#### 步骤 2:修改 Decoder 接口支持 DICT_CODE 输出 + +**文件**: `be/src/vec/exec/format/parquet/decoder.h` + +在 `Decoder` 基类中添加 DICT_CODE 模式支持。但考虑到 Doris 已有 `is_dict_filter` 参数实现了类似功能(当 `is_dict_filter=true` 时,`BaseDictDecoder::_decode_dict_values` 输出 int32),可以**复用现有机制**: + +```cpp +// 现有接口不变,但扩展 is_dict_filter 的含义: +// is_dict_filter=true → 输出 dict codes (int32) 到 doris_column +// 这与 StarRocks 的 ColumnContentType::DICT_CODE 等价 +``` + +因此不需要修改 Decoder 接口。Doris 现有的 `is_dict_filter=true` + `_decode_dict_values` 已经能输出 dict codes。 + +#### 步骤 3:在 LazyReadContext 中标记可延迟解码的列 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.h` + +在 `LazyReadContext` 中新增: + +```cpp +struct LazyReadContext { + // ... 现有成员 ... + + // Lazy Dict Decode:可以延迟字典解码的 lazy 列 + // (col_name, slot_id) 对 + std::vector> lazy_dict_decode_columns; +}; +``` + +#### 步骤 4:在 set_fill_columns 中识别可延迟解码的列 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_reader.cpp` + +在 `set_fill_columns()` 分类 lazy 列时,检查是否满足 lazy dict decode 条件: + +```cpp +// 在 lazy_read_columns 分类之后 +for (auto& lazy_col : _lazy_read_ctx.lazy_read_columns) { + // 条件: string 类型列 + // 全字典编码在 RowGroupReader::init() 时才能确认 + const auto& slot_desc = _get_slot_desc_by_name(lazy_col); + if (slot_desc && slot_desc->type().is_string_type()) { + _lazy_read_ctx.lazy_dict_decode_candidates.push_back( + {lazy_col, slot_desc->id()}); + } +} +``` + +#### 步骤 5:在 RowGroupReader::init() 中确认全字典编码 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` + +```cpp +// 在 _column_readers 创建之后 +for (auto& [col_name, slot_id] : _lazy_read_ctx.lazy_dict_decode_candidates) { + auto it = _column_readers.find(col_name); + if (it != _column_readers.end()) { + const auto& column_metadata = _get_column_metadata(col_name); + // 复用已有的 _can_filter_by_dict 中的字典编码检查逻辑 + if (column_metadata.encoding_stats.has_value()) { + bool all_dict = true; + for (auto& stat : column_metadata.encoding_stats.value()) { + if (stat.page_type == tparquet::PageType::DATA_PAGE || + stat.page_type == tparquet::PageType::DATA_PAGE_V2) { + if (stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY && + stat.encoding != tparquet::Encoding::RLE_DICTIONARY) { + all_dict = false; + break; + } + } + } + if (all_dict) { + _lazy_read_ctx.lazy_dict_decode_columns.push_back({col_name, slot_id}); + } + } + } +} +``` + +#### 步骤 6:修改 _do_lazy_read() 中 lazy 列的读取 + +**文件**: `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` + +在 Phase 2(读取 lazy 列)中,对 `lazy_dict_decode_columns` 中的列使用 dict code 模式读取: + +```cpp +// Phase 2: 读取 lazy 列 +// 先决策:是否使用 lazy dict decode(基于选择率) +bool use_lazy_dict_decode = false; +if (!_lazy_read_ctx.lazy_dict_decode_columns.empty() && 
filter_map.has_filter()) { + double selectivity = 1.0 - filter_map.filter_ratio(); + use_lazy_dict_decode = (selectivity < 0.2); // 存活率 < 20% +} + +if (use_lazy_dict_decode) { + // 分两组读取 lazy 列 + std::vector normal_lazy_cols; + std::vector dict_decode_lazy_cols; + std::set dict_decode_set; + for (auto& [name, _] : _lazy_read_ctx.lazy_dict_decode_columns) { + dict_decode_set.insert(name); + } + for (auto& col : _lazy_read_ctx.lazy_read_columns) { + if (dict_decode_set.count(col)) { + dict_decode_lazy_cols.push_back(col); + } else { + normal_lazy_cols.push_back(col); + } + } + + // 读取普通 lazy 列(原有路径) + if (!normal_lazy_cols.empty()) { + _read_column_data(block, normal_lazy_cols, pre_read_rows, + &lazy_read_rows, &lazy_eof, filter_map); + } + + // 读取 dict decode lazy 列(dict code 模式) + for (auto& col_name : dict_decode_lazy_cols) { + // 替换 block 中列类型为 Int32 + // (复用现有的 dict filter 列类型替换逻辑) + _replace_column_type_to_dict_code(block, col_name); + } + _read_column_data(block, dict_decode_lazy_cols, pre_read_rows, + &lazy_read_rows, &lazy_eof, filter_map, + /*is_dict_filter=*/true); +} else { + // 原有路径:直接读取所有 lazy 列 + _read_column_data(block, _lazy_read_ctx.lazy_read_columns, + pre_read_rows, &lazy_read_rows, &lazy_eof, filter_map); +} +``` + +#### 步骤 7:在过滤后转换 dict codes 到 strings + +在 `_do_lazy_read()` 的后续代码中(Phase 4,过滤后处理),添加 dict code 列的转换: + +```cpp +// 过滤 block +Block::filter_block_internal(block, filter_columns, result_filter); + +// 转换 dict filter 列(已有逻辑) +_convert_dict_cols_to_string_cols(block); + +// 转换 lazy dict decode 列(新增) +if (use_lazy_dict_decode) { + _convert_lazy_dict_cols_to_string_cols(block); +} +``` + +新增方法: + +```cpp +void RowGroupReader::_convert_lazy_dict_cols_to_string_cols(Block* block) { + for (auto& [col_name, slot_id] : _lazy_read_ctx.lazy_dict_decode_columns) { + // 找到 block 中对应的列 + auto col_idx = block->get_position_by_name(col_name); + auto& col_type_name = block->get_by_position(col_idx); + const auto& column = col_type_name.column; + + // 提取 ColumnInt32(可能是 Nullable 包装的) + const ColumnInt32* dict_column = nullptr; + ColumnPtr null_column = nullptr; + if (auto* nullable = check_and_get_column(*column)) { + dict_column = assert_cast( + nullable->get_nested_column_ptr().get()); + null_column = nullable->get_null_map_column_ptr(); + } else { + dict_column = assert_cast(column.get()); + } + + // 调用 column reader 的字典转换 + MutableColumnPtr string_col = + _column_readers[col_name]->convert_dict_column_to_string_column(dict_column); + + // 替换回 block + if (null_column) { + col_type_name.type = make_nullable(std::make_shared()); + block->replace_by_position(col_idx, + ColumnNullable::create(std::move(string_col), + null_column->clone_resized(string_col->size()))); + } else { + col_type_name.type = std::make_shared(); + block->replace_by_position(col_idx, std::move(string_col)); + } + } +} +``` + +#### 步骤 8:添加辅助方法 + +```cpp +// 替换 block 中列类型为 dict code (Int32) +void RowGroupReader::_replace_column_type_to_dict_code(Block* block, + const std::string& col_name) { + auto col_idx = block->get_position_by_name(col_name); + auto& col_type_name = block->get_by_position(col_idx); + bool is_nullable = col_type_name.type->is_nullable(); + if (is_nullable) { + col_type_name.type = make_nullable(std::make_shared()); + auto null_col = ColumnUInt8::create(); + col_type_name.column = ColumnNullable::create(ColumnInt32::create(), std::move(null_col)); + } else { + col_type_name.type = std::make_shared(); + col_type_name.column = ColumnInt32::create(); + } +} +``` + +### 3.3 涉及修改的文件清单 + +| 文件 | 
修改内容 | +|------|----------| +| `be/src/vec/exec/format/parquet/vparquet_group_reader.h` | `LazyReadContext` 添加 `lazy_dict_decode_columns`;`RowGroupReader` 新增相关方法声明 | +| `be/src/vec/exec/format/parquet/vparquet_group_reader.cpp` | `init()` 中识别可延迟解码列;`_do_lazy_read()` Phase 2 分路径处理;新增 `_convert_lazy_dict_cols_to_string_cols()`、`_replace_column_type_to_dict_code()` | +| `be/src/vec/exec/format/parquet/vparquet_reader.cpp` | `set_fill_columns()` 中标记候选 lazy dict decode 列 | + +### 3.4 StarRocks 参考 + +- **ColumnContentType 枚举**: `utils.h:30`,`VALUE` vs `DICT_CODE` +- **决策逻辑**: `scalar_column_reader.cpp:453-467`,`_need_lazy_decode` 基于 `_can_lazy_dict_decode && filter && selectivity < 0.2` +- **临时列切换**: `scalar_column_reader.cpp:504-545`,`dst = _tmp_code_column` 重定向输出到 Int32 列 +- **延迟解码**: `scalar_column_reader.cpp:567-591`,`_dict_decode()` 在 `_fill_dst_column_impl` 中执行 +- **条件判断**: `scalar_column_reader.h:161-164`,`_can_lazy_dict_decode = can_lazy_decode && is_string_type() && all_pages_dict_encoded()` + +### 3.5 与现有 Dict Filter 的关系 + +| 维度 | 现有 Dict Filter | 新增 Lazy Dict Decode | +|------|------------------|----------------------| +| **适用列** | 有 IN/EQ 谓词的 string 列 | 无谓词的 lazy string 列 | +| **触发条件** | 谓词类型匹配 + 全字典编码 | 全字典编码 + 选择率 < 20% | +| **机制** | 谓词改写(String → Int32 谓词) | 延迟物化(先读 codes,过滤后再转 string) | +| **转换时机** | `_convert_dict_cols_to_string_cols` | `_convert_lazy_dict_cols_to_string_cols` | +| **互不冲突** | 作用于 predicate columns | 作用于 lazy columns | + +两者可以并行工作:谓词列使用 Dict Filter,非谓词 lazy 列使用 Lazy Dict Decode。 + +### 3.6 注意事项 + +1. **非全字典编码的列**:Parquet 允许同一列的不同 page 使用不同编码(字典增长超限时回退到 PLAIN)。必须确认该列所有数据页都是字典编码,否则 DICT_CODE 模式会失败。 + +2. **Converter 兼容性**:`PhysicalToLogicalConverter` 在 `is_dict_filter=true` 时跳过类型转换。需确认 lazy 列走 dict code 路径时 converter 行为正确。 + +3. **选择率阈值**:与 P0-1 统一使用 0.2(20%)作为阈值。可通过配置参数调整。 + +4. **内存开销**:dict code 列 (Int32) 比实际 string 列小得多,不会增加内存压力。转换发生在过滤之后,此时行数已大幅减少。 + +--- + +## 总结:三个 P0 优化的协同效果 + +在一个典型的低选择率分析查询中(如 `SELECT * FROM t WHERE string_col = 'value' AND int_col > 100`,选择率 5%): + +``` +原有流程: + 1. 读 string_col 的全部 1M 行(dict decode → string copy) + 2. 读 int_col 的全部 1M 行 + 3. 执行 filter → 存活 50K 行 + 4. 读 lazy 列的全部 1M 行 + 5. 过滤 lazy 列到 50K 行 + +P0-1 (Filter 下推) + P0-2 (列顺序优化) + P0-3 (Lazy Dict Decode): + 1. 先读 string_col(选择率高的列先读)→ 50K 行存活 + 2. 带 filter 读 int_col(仅 50K 行物化)→ 45K 行存活 + 3. 读 lazy string 列为 dict codes (int32) → 仅 45K 行读取 + 4. 过滤后只对 45K 行做 dict code → string 转换 +``` + +**估算收益**: +- **P0-1**: dict 解码热路径减少 80% 无用 dict lookup(大字典时效果更明显) +- **P0-2**: 第二个谓词列只需解码 5% 的行(95% 被第一列过滤) +- **P0-3**: lazy string 列只转换 4.5% 的行,省去 95.5% 的 string copy + +三者叠加,在典型多列低选择率查询中可达到 **3-10x** 的纯读取层性能提升。 + +--- + +## P0-1 测试与验证方案 + +### T1. 正确性验证 + +#### T1.1 已有单元测试基线 + +**文件**: `be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp` + +已有 10 个测试用例覆盖了以下场景: +- `test_decode_values`: 基本字典解码 +- `test_decode_values_with_filter`: 带 filter 的解码 +- `test_decode_values_with_filter_and_null`: 带 filter + null 的解码 +- `test_decode_values_to_column_dict_i32`: 输出 dict codes 到 ColumnDictI32 +- `test_decode_values_to_column_int32`: 输出 dict codes 到 ColumnInt32 +- `test_skip_values`: 跳过值 + +**修改后必须确保所有已有测试通过**。 + +#### T1.2 新增 P0-1 正确性测试用例 + +在 `byte_array_dict_decoder_test.cpp` 和 `fix_length_dict_decoder_test.cpp`(如不存在则新建)中新增以下测试: + +```cpp +// 1. 
filter bitmap 下推 —— 低选择率场景 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_bitmap_low_selectivity) { + // 构造 1000 行数据,只有 5% 存活(filter bitmap 中 50 个 1) + // 验证:输出列内容与不使用 filter bitmap 的结果完全一致 + // 验证:CONTENT run 中 filter[i]=0 的行位置数据正确(值可以是任意的,但列长度正确) +} + +// 2. filter bitmap 下推 —— 高选择率场景(不应下推) +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_bitmap_high_selectivity) { + // 构造 1000 行数据,80% 存活 + // 验证:selectivity > 0.2 时 filter_data 不传入 decoder + // 验证:结果与原有路径一致 +} + +// 3. filter bitmap + null 混合 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_bitmap_and_nulls) { + // 构造含 null 的数据,filter bitmap 与 null map 交叉 + // 验证:null 行不受 filter bitmap 影响 + // 验证:CONTENT 中 filter[i]=1 的非 null 行正确解码 +} + +// 4. RleBatchDecoder::SkipBatch 正确性 +TEST_F(RleBatchDecoderTest, test_skip_batch) { + // 构造 RLE 编码数据(混合 RLE run + literal run) + // 执行 SkipBatch(n) 后继续 GetBatch() + // 验证:GetBatch() 返回的值与跳过后预期位置的值一致 +} + +// 5. BaseDictDecoder::skip_values 使用 SkipBatch +TEST_F(ByteArrayDictDecoderTest, test_skip_values_with_skip_batch) { + // 跳过若干值后继续解码 + // 验证:结果与旧实现(分配 buffer + GetBatch 丢弃)完全一致 +} + +// 6. 边界情况:全部被过滤 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_bitmap_all_filtered) { + // filter bitmap 全 0 + // 验证:不 crash,列长度正确 +} + +// 7. 边界情况:全部存活 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_bitmap_all_pass) { + // filter bitmap 全 1 + // 验证:结果与无 filter bitmap 完全一致 +} +``` + +#### T1.3 FixLengthDictDecoder 的对应测试 + +在 `fix_length_dict_decoder_test.cpp` 中新增类似测试,覆盖 INT32/INT64/FLOAT/DOUBLE 等定长类型的 filter bitmap 下推。 + +### T2. 性能验证 + +#### T2.1 方法一:Profile Counters(最简单) + +Doris 已有 Query Profile 机制,相关计数器定义在 `be/src/vec/exec/format/parquet/vparquet_reader.h`: + +``` +decode_value_time — Decoder 解码耗时(核心指标) +column_read_time — 列读取总耗时 +decode_dict_time — 字典解码耗时 +predicate_filter_time — 谓词过滤耗时 +lazy_read_filtered_rows — 懒加载跳过行数 +``` + +**测试步骤**: + +```sql +-- 1. 准备测试表:字典编码 string 列 + 低选择率谓词 +CREATE TABLE test_parquet_filter AS +SELECT * FROM parquet_file("path/to/large_dict_file.parquet"); + +-- 2. 关闭优化,记录 baseline +SET parquet_push_down_filter_to_decoder_enable = false; +SELECT count(*) FROM test_parquet_filter WHERE string_col = 'rare_value'; +-- 查看 Profile 中 decode_value_time + +-- 3. 
开启优化,对比 +SET parquet_push_down_filter_to_decoder_enable = true; +SELECT count(*) FROM test_parquet_filter WHERE string_col = 'rare_value'; +-- 查看 Profile 中 decode_value_time +``` + +**预期**:`decode_value_time` 在低选择率(< 20%)场景下降低 30-80%。 + +#### T2.2 方法二:Microbenchmark(最精确) + +新建 `be/test/vec/exec/format/parquet/decoder_benchmark.cpp`,使用 Google Benchmark 框架: + +```cpp +#include + +// 测试矩阵:dict_size × selectivity × type +// dict_size: 100, 1000, 10000, 100000(模拟 L2 cache 内/外) +// selectivity: 0.01, 0.05, 0.1, 0.2, 0.5, 1.0 +// type: INT32, INT64, STRING + +static void BM_DictDecode_NoFilter(benchmark::State& state) { + int dict_size = state.range(0); + double selectivity = state.range(1) / 100.0; + // 构造 dict decoder + 1M 行 RLE 数据 + // 构造 ColumnSelectVector(有 FILTERED_CONTENT runs) + for (auto _ : state) { + // 调用 decode_values(..., filter_data = nullptr) + } + state.SetItemsProcessed(state.iterations() * 1000000); +} + +static void BM_DictDecode_WithFilter(benchmark::State& state) { + int dict_size = state.range(0); + double selectivity = state.range(1) / 100.0; + // 同上,但传入 filter_data + for (auto _ : state) { + // 调用 decode_values(..., filter_data = bitmap) + } + state.SetItemsProcessed(state.iterations() * 1000000); +} + +// 测试矩阵 +BENCHMARK(BM_DictDecode_NoFilter) + ->Args({100, 5}) // 小字典, 5% 选择率 + ->Args({100, 50}) // 小字典, 50% 选择率 + ->Args({100000, 5}) // 大字典, 5% 选择率 + ->Args({100000, 50}); // 大字典, 50% 选择率 + +BENCHMARK(BM_DictDecode_WithFilter) + ->Args({100, 5}) + ->Args({100, 50}) + ->Args({100000, 5}) + ->Args({100000, 50}); +``` + +**预期结果矩阵**: + +| 字典大小 | 选择率 | WithFilter vs NoFilter | +|---------|--------|----------------------| +| 100(L2 内)| 5% | 持平或略优(dict lookup 本身很快) | +| 100(L2 内)| 50% | 持平(不应下推) | +| 100K(L2 外)| 5% | **显著提升 3-5x**(减少大量 cache miss) | +| 100K(L2 外)| 50% | 略有提升 | + +#### T2.3 方法三:端到端 SQL 测试(最贴近生产) + +准备测试数据集: + +```bash +# 生成测试 Parquet 文件 +# - 10M 行 +# - string_col: 字典编码,字典大小 50000(超过 L2 cache) +# - int_col: 普通 INT32 +# - 谓词 string_col = 'value_42' 选择率约 0.002% + +python3 generate_test_parquet.py \ + --rows 10000000 \ + --dict-size 50000 \ + --output /path/to/test_large_dict.parquet +``` + +**测试 SQL**: + +```sql +-- Case 1: 低选择率 string 谓词(最大收益场景) +SELECT count(*), sum(int_col) +FROM parquet_file("/path/to/test_large_dict.parquet") +WHERE string_col = 'value_42'; + +-- Case 2: 多列低选择率谓词 +SELECT count(*) +FROM parquet_file("/path/to/test_large_dict.parquet") +WHERE string_col IN ('value_1', 'value_2', 'value_3') + AND int_col > 900000; + +-- Case 3: 高选择率谓词(应无差异,验证不退化) +SELECT count(*) +FROM parquet_file("/path/to/test_large_dict.parquet") +WHERE int_col > 0; -- 几乎全部存活 +``` + +### T3. 关键观测指标 + +| 指标 | 获取方式 | 预期变化 | +|------|---------|---------| +| `decode_value_time` | Query Profile | 低选择率场景降低 30-80% | +| `column_read_time` | Query Profile | 随 decode_value_time 降低 | +| 查询总延迟 | SQL 客户端 | 取决于 decode 在总耗时中的占比 | +| L2 cache miss | `perf stat -e cache-misses` | 大字典场景显著降低 | +| 内存分配 | `skip_values` 路径 | 消除 `_indexes.resize()` 分配 | + +### T4. 验证执行顺序 + +1. **单元测试**(T1):实现后第一时间运行,确保功能正确 + ```bash + cd be && ./run_ut.sh --test ByteArrayDictDecoderTest + cd be && ./run_ut.sh --test FixLengthDictDecoderTest + ``` + +2. **Microbenchmark**(T2.2):确认性能数据符合预期 + ```bash + cd be && ./run_benchmark.sh decoder_benchmark + ``` + +3. **回归测试**:运行完整 Parquet 读取相关回归测试 + ```bash + cd regression-test && ./run.sh -s external_table_p0/parquet + ``` + +4. **端到端 SQL**(T2.3):在测试环境中执行,对比 Profile + +5. 
**(可选)perf stat**:验证 cache miss 降低 + ```bash + perf stat -e cache-references,cache-misses,L1-dcache-load-misses \ + doris_be --query "SELECT count(*) FROM ... WHERE ..." + ``` + +### T5. 新增 Profile Counter(建议) + +为更精确追踪 P0-1 的效果,建议在 `ReaderStatistics` 中新增计数器: + +```cpp +// be/src/vec/exec/format/parquet/vparquet_reader.h +struct ReaderStatistics { + // ... 现有计数器 ... + + // P0-1 新增 + int64_t filter_bitmap_pushdown_count = 0; // filter bitmap 下推次数 + int64_t filter_bitmap_skipped_lookups = 0; // 跳过的 dict lookup 次数 + int64_t rle_skip_batch_count = 0; // SkipBatch 调用次数 +}; +``` + +对应的 Profile 名称: +- `FilterBitmapPushdownCount` +- `FilterBitmapSkippedLookups` +- `RLESkipBatchCount` + +这些计数器可以在 Query Profile 中直观展示优化的触发频率和效果。 diff --git a/docs/Parquet_Reader_Performance_Optimization_Analysis.md b/docs/Parquet_Reader_Performance_Optimization_Analysis.md new file mode 100644 index 00000000000000..9b45714a545c35 --- /dev/null +++ b/docs/Parquet_Reader_Performance_Optimization_Analysis.md @@ -0,0 +1,442 @@ +# Doris Parquet Reader 纯读取层性能优化方向分析 + +> 对比 DuckDB 与 StarRocks 的 Parquet Reader 实现,从纯读取层角度分析 Doris 的优化方向。 + +--- + +## 一、三者架构总览 + +| 维度 | Doris | DuckDB | StarRocks | +|------|-------|--------|-----------| +| 入口类 | `ParquetReader` → `RowGroupReader` → `ScalarColumnReader` | `ParquetScanFunction` → `ParquetReader` → `ColumnReader` | `FileReader` → `GroupReader` → `ScalarColumnReader` | +| 解码器 | 自研 Decoder 体系 (Plain/Dict/Delta/BSS/RLE) | 自研模板化 Decoder (高度类型特化) | 自研 Decoder 体系 + SIMD intrinsics | +| IO 层 | `BufferedFileStreamReader` + `MergeRangeFileReader` | `BufferedFileReader` + 自适应 prefetch | `SharedBufferedInputStream` (全局 IO coalescing) | +| 向量化 | `ColumnSelectVector` run-length 批处理 | DuckDB Vector (2048 batch) 原生向量化 | 模板特化 + AVX2 SIMD + branchless | +| 延迟物化 | 2 级 (谓词列 vs lazy 列) | 依赖执行引擎的 filter pushdown | 4 级 (列分组 + lazy dict + lazy convert + filter→decoder) | + +### 关键源码位置 + +**Doris:** +- `be/src/vec/exec/format/parquet/vparquet_reader.h` — `ParquetReader` 主入口 +- `be/src/vec/exec/format/parquet/vparquet_group_reader.h` — `RowGroupReader` +- `be/src/vec/exec/format/parquet/vparquet_column_reader.h` — `ScalarColumnReader` +- `be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h` — `ColumnChunkReader` +- `be/src/vec/exec/format/parquet/decoder.h` — Decoder 基类 +- `be/src/vec/exec/format/parquet/parquet_common.h` — `ColumnSelectVector` / `FilterMap` + +**DuckDB:** +- `extension/parquet/include/parquet_reader.hpp` — `ParquetReader` +- `extension/parquet/include/column_reader.hpp` — `ColumnReader` 基类 + `PlainTemplatedInternal` +- `extension/parquet/include/reader/templated_column_reader.hpp` — 模板化列读取器 +- `extension/parquet/include/decoder/dictionary_decoder.hpp` — 字典解码器 +- `extension/parquet/include/decode_utils.hpp` — bitpack/zigzag/varint 工具 + +**StarRocks:** +- `be/src/formats/parquet/file_reader.h` — `FileReader` +- `be/src/formats/parquet/group_reader.h` — `GroupReader` +- `be/src/formats/parquet/scalar_column_reader.h` — `ScalarColumnReader` +- `be/src/formats/parquet/stored_column_reader.h` — `StoredColumnReaderImpl` +- `be/src/formats/parquet/encoding_dict.h` — `CacheAwareDictDecoder` + AVX2 +- `be/src/formats/parquet/encoding_plain.h` — Plain 解码 + SIMD +- `be/src/formats/parquet/column_read_order_ctx.h` — 列读取顺序优化 + +--- + +## 二、逐层对比分析 + +### 1. 谓词下推 & Row Group 过滤 + +#### Doris 现状 + +三级漏斗,在 `ParquetReader::_next_row_group_reader()` (`vparquet_reader.cpp:743`) 中编排: + +1. **Range 对齐检查**:`_is_misaligned_range_group()` (line 900) — 检查 row group 中点是否在分配的 scan range 内 +2. 
**Row Group 级 Min/Max + Bloom Filter**:`_process_column_stat_filter()` (line 1171) — 逐列评估 min/max 统计值,同列多谓词共享 bloom filter 缓存 +3. **Page Index**:`_process_page_index_filter()` (line 914) — 读取 Column Index 做页级 min/max 过滤,产出 `RowRanges` + +#### DuckDB 优势 + +- **Zone Map 与 filter 框架统一**:`ParquetStatisticsUtils` 做类型感知的统计比较,与 DuckDB filter pushdown 框架紧密集成 +- **自适应 prefetch 策略**:`disable_parquet_prefetching` / `prefetch_all_parquet_files` 两个开关,根据文件类型(本地 vs 远程)自动选择预取策略 +- **Metadata cache**:`parquet_metadata_cache` 选项,支持跨查询缓存 metadata,避免重复解析同一文件 footer + +#### StarRocks 优势 + +- **Runtime Filter 动态 Row Group 剪裁**:`RuntimeScanRangePruner` (`file_reader.cpp:358-373`) 在扫描过程中,当新的 runtime filter 到达时,通过 `_update_rf_and_filter_group()` 动态跳过尚未读取的 row group。Doris 的 runtime filter 在 scan 开始前就已确定,缺乏这种动态能力 +- **Bloom Filter 自适应 IO 决策**:`adaptive_judge_if_apply_bloom_filter(span_size)` (`column_reader.h:202`) 根据数据量判断 bloom filter IO 是否值得 +- **统一的 `PredicateFilterEvaluator`**:visitor 模式遍历 `PredicateTree`,同时派发 zone map / page index / bloom filter 三种过滤,架构更清晰 + +> **→ 优化方向 1:Runtime Filter 动态 Row Group 剪裁** +> +> Join 查询中 build 端完成后,probe 端扫描过程中动态跳过不满足条件的 row groups,避免无用 IO 和解码。 + +--- + +### 2. 解码层优化 + +#### Doris 现状 + +`decoder.h:50-92`,`vparquet_column_reader.cpp:321`: + +- `ColumnSelectVector` 将 null map + filter map 编码为 RLE 流 (CONTENT / NULL_DATA / FILTERED_CONTENT / FILTERED_NULL),decoder 按 run 批量处理 +- `BaseDictDecoder::_decode_dict_values()` 模板化 filter 分支 +- `ScalarColumnReader` 四重模板特化消除嵌套/索引分支 +- **无任何 SIMD intrinsics** + +#### DuckDB 优势 + +- **四重模板特化的 Plain 解码**:`PlainTemplatedInternal` 生成 4 条编译时路径,无 NULL + 类型大小匹配时退化为单次 `memcpy`: + +```cpp +// column_reader.hpp:218-224 +if (!HAS_DEFINES && !CHECKED && CONVERSION::PlainConstantSize() == sizeof(VALUE_TYPE)) { + idx_t copy_count = num_values * CONVERSION::PlainConstantSize(); + memcpy(result_ptr + result_offset, plain_data.ptr, copy_count); + plain_data.unsafe_inc(copy_count); + return; +} +``` + +- **String dictionary zero-copy**:直接引用 dict buffer 中的数据,通过 `StringHeap` 管理生命周期,避免 memcpy +- **直接写入 DuckDB Vector(2048 行)**:无中间格式转换 + +#### StarRocks 优势 — 多处 AVX2 SIMD 加速 + +**(a) FLBA 向量化 Slice 构造** (`encoding_plain.h:586-605`): + +```cpp +#ifdef __AVX2__ +// 每次迭代处理 4 个 Slice,用 256-bit 寄存器批量构造 +__m256i fixed_length = _mm256_set1_epi64x(_type_length); +__m256i inc = _mm256_set1_epi64x(_type_length * 4); +// shuffle + store 4 Slices at once +#endif +``` + +**(b) Dictionary Decoder 的 AVX2 Null 处理** (`encoding_dict.h:146-172`): + +```cpp +#ifdef __AVX2__ +// 稀疏 null 列(非空率 < 10%)用 AVX2 扫描 null bitmap +__m256i loaded = _mm256_loadu_si256((__m256i*)&nulls[i]); +int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(loaded, _mm256_setzero_si256())); +// 用 phmap BitMask 迭代 set bits,scatter 非空值到正确位置 +#endif +// 稠密路径用 SIMD::Expand::expand_load() +``` + +**(c) Branchless Null 处理** (`encoding_dict.h:469-473`): + +```cpp +uint32_t mask = ~(static_cast(-null_data_ptr[i])); +int32_t code = mask & dict_codes[i]; // 无分支选择 +``` + +**(d) `append_strings_overflow()` SIMD 安全读取**:允许读取超出字符串边界最多 `APPEND_OVERFLOW_MAX_SIZE` 字节,避免 SIMD 边界检查开销。 + +> **→ 优化方向 2:SIMD 加速解码** +> +> Doris 的解码器完全没有 SIMD 优化。可参考 StarRocks 实现: +> - Dict 解码中的 null bitmap AVX2 扫描 + scatter +> - FLBA 向量化 Slice 构造 +> - Branchless null 处理模式 + +--- + +### 3. 
Filter 下推到 Decoder 内部 + +#### Doris 现状 + +- `ColumnSelectVector` 在 decoder 外层将 filter map 编码为 FILTERED_CONTENT run,decoder 内部逐 run 调用 `skip_values()` 跳过被过滤的值 +- 问题:即使是 skip,也需要 RLE 解码 dict codes 来推进位置,开销不小 + +#### StarRocks 实现 + +`stored_column_reader.h:155-161`: + +```cpp +const FilterData* _convert_filter_row_to_value(const Filter* filter, size_t row_readed) { + if (!filter || !config::parquet_push_down_filter_to_decoder_enable) return nullptr; + // 选择率 < 20% 时,直接传 filter bitmap 给 decoder + return SIMD::count_nonzero(*filter) * 1.0 / filter->size() < 0.2 + ? filter->data() + row_readed : nullptr; +} +``` + +当选择率 < 20% 时,filter bitmap 直接传入 `Decoder::next_batch(count, content_type, dst, filter)`,decoder 内部跳过被过滤值的物化(不执行 dict lookup、不执行 string copy),比 Doris 的外层 skip 更高效。 + +#### DuckDB 实现 + +DuckDB 在 dictionary 初始化时一次性评估 filter,标记每个 dict entry 是否满足条件。后续如果一个页面的所有 dict entries 都被过滤(`HasFilteredOutAllValues()`),则整页直接跳过,连 RLE 解码都不做。 + +> **→ 优化方向 3:Filter 下推到 Decoder 层** +> +> 在低选择率场景(< 20%),直接将 filter bitmap 传给 decoder,decoder 内部跳过被过滤值的物化。与 Doris 现有的 `FILTERED_CONTENT` run 机制相比,省去了 "先解码 → 再 skip" 的开销。 + +--- + +### 4. Cache-Aware 字典解码 + +#### Doris 现状 + +无任何 cache 感知的解码策略。 + +#### StarRocks 实现 + +`encoding_dict.h:91-127` `CacheAwareDictDecoder`: + +```cpp +CacheAwareDictDecoder() { _dict_size_threshold = CpuInfo::get_l2_cache_size(); } + +Status next_batch(size_t count, ColumnContentType content_type, Column* dst, + const FilterData* filter) { + // ... + if (_get_dict_size() > _dict_size_threshold && + config::parquet_cache_aware_dict_decoder_enable) { + return _next_batch_value(count, dst, filter); // 传入 filter,跳过无用 lookup + } else { + return _next_batch_value(count, dst, nullptr); // 不传 filter,直接 lookup + } +} +``` + +核心逻辑: +- 字典 > L2 cache → 随机 dict lookup 产生大量 cache miss → 传入 filter bitmap 跳过无用 lookup +- 字典 < L2 cache → lookup 基本都是 cache hit → 传 filter 反而增加判断开销 + +> **→ 优化方向 4:Cache-Aware 字典解码** +> +> 大字典(> L2 cache 大小,通常 256KB-1MB)的 dict lookup 是 cache-unfriendly 的热点操作。结合 filter bitmap 跳过无用 lookup 可以显著减少 L2 cache miss。 + +--- + +### 5. 延迟物化(Late Materialization) + +#### Doris 现状 + +`vparquet_group_reader.cpp:518` `_do_lazy_read()`: + +- **2 级**:predicate columns(先读) + lazy columns(后读,带 filter map) +- `_cached_filtered_rows` 跨 batch 累积,允许跳过整页 lazy 列 +- `filter_ratio > 0.6` 时触发整页跳过优化 + +#### DuckDB + +- 不在 reader 内部做 late materialization,依赖执行引擎的 filter + projection pushdown +- `AdaptiveFilter` 动态重排 filter 执行顺序(filter 级别而非列级别) + +#### StarRocks — 4 级延迟物化 + +| 层级 | 机制 | 位置 | 说明 | +|------|------|------|------| +| L1 | Active vs Lazy 列分离 | `GroupReader._active_column_indices` / `_lazy_column_indices` | 与 Doris 类似,谓词列先读,非谓词列后读 | +| L2 | **Lazy Dictionary Decode** | `ScalarColumnReader._can_lazy_dict_decode` (`scalar_column_reader.h:162`) | string 类型 + 全页字典编码时,先只读 dict codes (int32),filter 后仅对存活行做 dict lookup → string 物化 | +| L3 | **Lazy Type Conversion** | `ScalarColumnReader._can_lazy_convert` | 先以 Parquet 原生类型读取(如 INT96),filter 后仅对存活行做类型转换(如 INT96→DateTime) | +| L4 | Filter 下推到 Decoder | `_convert_filter_row_to_value()` | 选择率 < 20% 时直接跳过值物化 | + +**关键细节:自适应阈值** — Lazy Dict Decode 仅在 `FILTER_RATIO < 0.2` 时启用 (`scalar_column_reader.h:215`),避免低选择率时增加无用的中间步骤。 + +> **→ 优化方向 5:引入 Lazy Dict Decode + Lazy Type Conversion** +> +> - **Lazy Dict Decode**:对 string 类型字典编码列,先只读 dict codes (int32),filter 后仅对存活行做字典 lookup。在高过滤率场景,省去大量 string copy +> - **Lazy Type Conversion**:先以 Parquet 物理类型读取,filter 后仅对存活行做类型转换(如 INT96→DateTime、FLBA→Decimal) + +--- + +### 6. 
列读取顺序优化 + +#### Doris 现状 + +lazy read 只区分 "谓词列" 和 "非谓词列" 两组,两组内部无排序。 + +#### StarRocks 实现 + +`column_read_order_ctx.h:24-54`: + +```cpp +class ColumnReadOrderCtx { + std::vector _column_indices; // 最优顺序 + std::vector _trying_column_indices; // 当前尝试的顺序 + size_t _min_round_cost = 0; // 最小 round cost + size_t _rand_round_order_index = 10; // 从 10 个随机顺序中选最优 + std::unordered_map _column_cost_map; // 列 → cost +}; +``` + +- `_read_range_round_by_round()` (`group_reader.h:173`) 按轮次读列,每轮之间可应用 filter +- `update_ctx(round_cost, first_selectivity)` 动态更新列读取顺序 +- 从 10 个随机顺序中选择 cost 最低的排列,实现自适应优化 +- 高选择率谓词列先读 → 产生 filter → 后续列在更少行上物化 + +#### DuckDB 实现 + +`AdaptiveFilter` (`parquet_reader.cpp:1432-1452`) 运行时动态重排 filter 执行顺序,粒度是 filter 级别: + +```cpp +auto filter_state = state.adaptive_filter->BeginFilter(); +for (idx_t i = 0; i < state.scan_filters.size(); i++) { + auto &scan_filter = state.scan_filters[state.adaptive_filter->permutation[i]]; + // ... evaluate filter ... +} +state.adaptive_filter->EndFilter(filter_state); +``` + +> **→ 优化方向 6:列读取顺序优化** +> +> 在谓词列内部按选择率排序:先读选择率高(过滤效果好)的列,产生 filter 后再读其他列。最大化 filter 效果,减少后续列的解码量。 + +--- + +### 7. IO 模式 + +#### Doris 现状 + +- `BufferedFileStreamReader`:每列独立的顺序预读缓冲 +- `MergeRangeFileReader`:平均 IO < `SMALL_IO` 阈值时,合并邻近小 IO +- 两者互斥(有 MergeRange 时禁用 Buffered prefetch,避免双缓冲) +- `StoragePageCache`:LRU 页面缓存,支持压缩/解压两种缓存策略 +- `FileMetaCache`:footer 缓存 + +#### DuckDB 优势 + +- **自适应 prefetch**:根据文件存储类型(本地 vs 远程 S3/HTTP)自动调整 prefetch 策略 +- **整 Row Group prefetch**:当扫描 > 95% 列且无 filter 时,一次性预取整个 row group 数据范围 +- **列级 prefetch 与 lazy fetch 协作**:有 filter 时,filter 列立即预取,非 filter 列延迟预取(`allow_merge=false`) +- **Metadata cache 独立配置**:`parquet_metadata_cache` 允许跨查询缓存 metadata + +#### StarRocks 优势 + +- **`SharedBufferedInputStream` 全局 IO Coalescing**:row group 内所有列共享同一个缓冲输入流,统一收集所有列的 IO ranges 后全局合并 +- **分类型 IO 收集**:区分 `PAGES` / `PAGE_INDEX` / `BLOOM_FILTER` 三种 IO 类型,分别收集和调度 +- **Lazy Column IO 延迟合并**:`lazy_column_coalesce_counter` (`group_reader.h:98`) 追踪是否需要将 lazy 列 IO 与 active 列合并,避免预读永远不会被使用的 lazy 列数据 +- **DataCache 集成**:与 StarRocks 的分布式缓存系统集成 + +> **→ 优化方向 7:统一的 IO Coalescing** +> +> Doris 的 `MergeRangeFileReader` 只做简单的邻近 IO 合并。StarRocks 的全局 IO coalescing 跨列统一优化,对远程存储(S3/HDFS)场景可以显著减少 IO 次数。且区分 active/lazy 列的 IO 策略更精细。 + +--- + +### 8. Page Index 利用 + +#### Doris 现状 + +- 支持 Offset Index(页级定位)和 Column Index(页级 min/max) +- `OFFSET_INDEX=true` 模板参数启用直接页面寻址,消除运行时开销 +- `_process_page_index_filter()` 利用 Column Index 做页级行范围过滤 + +#### StarRocks + +- `StoredColumnReaderWithIndex`:专门的带索引读取器 +- `_next_selected_page()` 直接跳到下一个选中的页面 +- 与 Zone Map Filter 统一流程:`page_index_zone_map_filter()` 返回 `SparseRange`,与 row group 级过滤结果直接交集 + +#### DuckDB + +- **不支持 Page Index**(ColumnIndex / OffsetIndex)。无法做页级行范围过滤。在这一点上 Doris 和 StarRocks 均领先。 + +> **该方面 Doris 已有较好实现**,通过模板参数消除了运行时开销。 + +--- + +### 9. 编码支持完整度 + +| 编码 | Doris | DuckDB | StarRocks | +|------|-------|--------|-----------| +| PLAIN | ✅ | ✅ | ✅ | +| RLE_DICTIONARY | ✅ | ✅ | ✅ | +| RLE (Boolean) | ✅ | ✅ | ✅ | +| DELTA_BINARY_PACKED | ✅ | ✅ | ✅ | +| DELTA_BYTE_ARRAY | ✅ | ✅ | ✅ | +| DELTA_LENGTH_BYTE_ARRAY | ✅ | ✅ | ✅ | +| BYTE_STREAM_SPLIT | ✅ | ✅ | ✅ | + +三者编码支持基本对齐,差异不大。 + +--- + +### 10. 字典过滤优化 + +#### Doris 现状 + +`RowGroupReader::_rewrite_dict_predicates()` (`vparquet_group_reader.cpp:1042`): + +1. 读取字典值到 string column +2. 构建临时 Block,执行 conjuncts 过滤 +3. 全部命中则跳过整个 row group(`_is_row_group_filtered = true`) +4. 部分命中则改写为 dict code 上的 `EQ` / `IN` 谓词,避免后续 string 比较 +5. 有上限:`MAX_DICT_CODE_PREDICATE_TO_REWRITE`,超过则退回原始谓词 +6. 
读取后需 `_convert_dict_cols_to_string_cols()` 将 dict codes 转回字符串 + +#### DuckDB + +- 字典解码更轻量:直接在 dictionary buffer 上做 lookup,string 结果引用 dict buffer(zero-copy) +- `DictionaryDecoder::InitializeDictionary()` 接受 filter,一次性评估所有 dict entries + +#### StarRocks + +- 四级 Lazy 机制配合字典过滤(见上文第 5 节) +- 自适应 Lazy Decode 阈值:`FILTER_RATIO = 0.2` +- L2 Cache 感知:`CacheAwareDictDecoder`(见上文第 4 节) +- Struct 子字段级字典过滤下推:`StructColumnReader` 通过 `sub_field_path` 路由字典过滤到子 reader + +--- + +## 三、Doris 现有优势 + +对比之下,Doris 也有自身的亮点: + +1. **模板四重特化**:`ScalarColumnReader` × `ColumnChunkReader` × `PageReader` 各 4 个实例化(共 12 个),消除了嵌套列处理和 offset index 的运行时分支 + +2. **ColumnSelectVector run-length 批处理**:将 null map + filter map 编码为 run-length 流,decoder 按 run 批量处理,比逐行判断高效 + +3. **Page Index 完整支持**:支持 Offset Index + Column Index,通过模板参数消除运行时开销(DuckDB 不支持 Page Index) + +4. **Page Cache 两级策略**:根据压缩比选择缓存压缩数据还是解压数据,平衡内存占用和 CPU 开销 + +5. **MergeRangeFileReader 与 BufferedFileStreamReader 互斥**:避免双缓冲浪费 + +--- + +## 四、总结:优化方向优先级排序 + +### P0 — 高收益,改动可控 + +| # | 优化方向 | 参考实现 | 核心收益 | +|---|---------|---------|---------| +| 1 | **Filter bitmap 下推到 Decoder** | StarRocks `stored_column_reader.h:155-161` | 低选择率查询(< 20% 存活)减少 60-80% 无用值物化 | +| 2 | **谓词列读取顺序优化** | StarRocks `ColumnReadOrderCtx` / DuckDB `AdaptiveFilter` | 多谓词列查询,最大化 filter 裁剪效果,减少后续列解码量 | +| 3 | **Lazy Dictionary Decode** | StarRocks `ScalarColumnReader._can_lazy_dict_decode` | 字典编码 string 列 + 高过滤率时省去大量 string copy | + +### P1 — 中等收益 + +| # | 优化方向 | 参考实现 | 核心收益 | +|---|---------|---------|---------| +| 4 | **AVX2 SIMD 解码热路径** | StarRocks `encoding_dict.h` null scatter/expand | CPU-bound 场景整体解码加速 | +| 5 | **Cache-Aware 字典解码** | StarRocks `CacheAwareDictDecoder` (L2 cache check) | 大字典(> L2 cache)场景减少 cache miss | +| 6 | **Plain 编码 memcpy 快速路径** | DuckDB `PlainTemplatedInternal` 四重模板 | 无 NULL 定长列整批 memcpy,消除逐值处理 | +| 7 | **全局 IO Coalescing** | StarRocks `SharedBufferedInputStream` | 远程存储(S3/HDFS)多列查询减少 IO 次数 | + +### P2 — 长期优化 + +| # | 优化方向 | 参考实现 | 核心收益 | +|---|---------|---------|---------| +| 8 | **Runtime Filter 动态 Row Group 剪裁** | StarRocks `RuntimeScanRangePruner` | Join 查询中 build 端完成后动态跳过 probe 端 row groups | +| 9 | **Lazy Type Conversion** | StarRocks `_can_lazy_convert` | INT96→DateTime 等需类型转换列 + filter 场景 | +| 10 | **String Zero-Copy Dict Lookup** | DuckDB `StringHeap` 引用 dict buffer | 字典编码 string 列减少 memcpy 开销 | + +--- + +## 五、核心结论 + +Doris 的 Parquet Reader 架构设计合理,模板四重特化和 `ColumnSelectVector` run-length 批处理是其亮点。但与 StarRocks 对比,在三个关键维度存在明显差距: + +1. **Decoder 层精细度**:StarRocks 的 filter→decoder 下推 + cache-aware dict + SIMD intrinsics,使得解码热路径效率显著更高。Doris 的 decoder 没有任何 SIMD,也不接收 filter bitmap。 + +2. **延迟物化深度**:Doris 2 级 vs StarRocks 4 级。差距主要在 dict decode 和 type convert 两个环节的延迟物化 — StarRocks 可以先读 dict codes (int32),filter 后仅对存活行做字典 lookup 和类型转换。 + +3. **列间协作**:StarRocks 的 `ColumnReadOrderCtx` 在谓词列之间做顺序优化(高选择率列先读),DuckDB 也有 `AdaptiveFilter` 动态重排。Doris 缺乏谓词列间的排序优化。 + +与 DuckDB 对比,Doris 在 Page Index 支持上领先(DuckDB 不支持),但 DuckDB 在 Plain 解码的 memcpy 快速路径和 String 零拷贝字典引用上有优势。 + +**最大的性能杠杆在 P0 三项** — 不需要大规模重构架构,但能在典型分析查询(低选择率 + 多谓词列 + 字典编码 string)中带来显著提升。 diff --git a/run-be-benchmark.sh b/run-be-benchmark.sh new file mode 100755 index 00000000000000..7265ab85b90a10 --- /dev/null +++ b/run-be-benchmark.sh @@ -0,0 +1,390 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +##################################################################### +# This script is used to build and run Google Benchmark of Doris Backend. +# Usage: $0 +# Optional options: +# --clean clean and rebuild benchmark +# --run build and run benchmark +# --run --filter=xx build and run specified benchmark(s) +# -j build parallel +# -h print this help message +# +# Benchmark requires RELEASE build type. +# The build directory is: be/build_benchmark/ +##################################################################### + +set -eo pipefail +set +o posix + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +export ROOT +export DORIS_HOME="${ROOT}" + +. "${DORIS_HOME}/env.sh" + +# Check args +usage() { + echo " +Usage: $0 + Optional options: + --clean clean and rebuild benchmark + --run build and run benchmark + --run --filter=xx build and run specified benchmark(s) (Google Benchmark --benchmark_filter) + -j build parallel + -h print this help message + + Eg. + $0 build benchmark only + $0 --run build and run all benchmarks + $0 --run --filter=BM_ByteArrayDictDecode.* build and run matching benchmarks + $0 --clean clean and rebuild benchmark + $0 --clean --run clean, rebuild and run all benchmarks + $0 -j 16 --run build with 16 jobs and run + " + exit 1 +} + +if ! OPTS="$(getopt -n "$0" -o hj: -l run,clean,filter: -- "$@")"; then + usage +fi + +eval set -- "${OPTS}" + +CLEAN=0 +RUN=0 +FILTER="" +PARALLEL="" +if [[ "$#" != 1 ]]; then + while true; do + case "$1" in + --clean) + CLEAN=1 + shift + ;; + --run) + RUN=1 + shift + ;; + --filter) + FILTER="$2" + shift 2 + ;; + -j) + PARALLEL="$2" + shift 2 + ;; + -h) + usage + ;; + --) + shift + break + ;; + *) + usage + ;; + esac + done +fi + +if [[ -z "${PARALLEL}" ]]; then + PARALLEL="$(($(nproc) / 4 + 1))" +fi + +# Benchmark requires RELEASE build type +CMAKE_BUILD_TYPE="RELEASE" +CMAKE_BUILD_DIR="${DORIS_HOME}/be/build_benchmark" + +echo "Get params: + PARALLEL -- ${PARALLEL} + CLEAN -- ${CLEAN} + RUN -- ${RUN} + FILTER -- ${FILTER} + CMAKE_BUILD_TYPE -- ${CMAKE_BUILD_TYPE} + CMAKE_BUILD_DIR -- ${CMAKE_BUILD_DIR} + ENABLE_PCH -- ${ENABLE_PCH} +" +echo "Build Backend Benchmark" + +# Update submodules (same as run-be-ut.sh) +update_submodule() { + local submodule_path=$1 + local submodule_name=$2 + local archive_url=$3 + + set +e + cd "${DORIS_HOME}" + echo "Update ${submodule_name} submodule ..." + git submodule update --init --recursive "${submodule_path}" + exit_code=$? 
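+    # Re-enable errexit; if the git submodule update failed, the fallback below
+    # downloads a tarball of the pinned submodule commit from the archive URL.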
+ set -e + if [[ "${exit_code}" -ne 0 ]]; then + submodule_commit=$(git ls-tree HEAD "${submodule_path}" | awk '{print $3}') + commit_specific_url=$(echo "${archive_url}" | sed "s/refs\/heads/${submodule_commit}/") + echo "Update ${submodule_name} submodule failed, start to download and extract ${commit_specific_url}" + mkdir -p "${DORIS_HOME}/${submodule_path}" + curl -L "${commit_specific_url}" | tar -xz -C "${DORIS_HOME}/${submodule_path}" --strip-components=1 + fi +} + +# Update submodules only if they are not initialized yet +if [[ ! -f "${DORIS_HOME}/contrib/apache-orc/CMakeLists.txt" ]]; then + update_submodule "contrib/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc.tar.gz" +fi +if [[ ! -f "${DORIS_HOME}/contrib/clucene/CMakeLists.txt" ]]; then + update_submodule "contrib/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene.tar.gz" +fi + +# Handle clean +if [[ "${CLEAN}" -eq 1 ]]; then + pushd "${DORIS_HOME}/gensrc" + make clean + popd + rm -rf "${CMAKE_BUILD_DIR}" + rm -rf "${DORIS_HOME}/be/output" +fi + +if [[ ! -d "${CMAKE_BUILD_DIR}" ]]; then + mkdir -p "${CMAKE_BUILD_DIR}" +fi + +# Platform defaults (same as run-be-ut.sh / build.sh) +if [[ -z "${GLIBC_COMPATIBILITY}" ]]; then + if [[ "$(uname -s)" != 'Darwin' ]]; then + GLIBC_COMPATIBILITY='ON' + else + GLIBC_COMPATIBILITY='OFF' + fi +fi + +if [[ -z "${USE_LIBCPP}" ]]; then + if [[ "$(uname -s)" != 'Darwin' ]]; then + USE_LIBCPP='OFF' + else + USE_LIBCPP='ON' + fi +fi + +if [[ -z "${USE_AVX2}" ]]; then + USE_AVX2='ON' +fi + +if [[ -z "${ARM_MARCH}" ]]; then + ARM_MARCH='armv8-a+crc' +fi + +if [[ -z "${USE_UNWIND}" ]]; then + if [[ "$(uname -s)" != 'Darwin' ]]; then + USE_UNWIND='ON' + else + USE_UNWIND='OFF' + fi +fi + +if [[ -z "${USE_JEMALLOC}" ]]; then + if [[ "$(uname -s)" != 'Darwin' ]]; then + USE_JEMALLOC='ON' + else + USE_JEMALLOC='OFF' + fi +fi + +if [[ "$(echo "${DISABLE_BUILD_AZURE}" | tr '[:lower:]' '[:upper:]')" == "ON" ]]; then + BUILD_AZURE='OFF' +else + BUILD_AZURE='ON' +fi + +MAKE_PROGRAM="$(command -v "${BUILD_SYSTEM}")" +echo "-- Make program: ${MAKE_PROGRAM}" +echo "-- Use ccache: ${CMAKE_USE_CCACHE}" +echo "-- Extra cxx flags: ${EXTRA_CXX_FLAGS:-}" + +# Configure and build +cd "${CMAKE_BUILD_DIR}" + +# Only run cmake configure when needed: +# 1. No CMakeCache.txt yet (first build or after --clean) +# 2. User explicitly requested --clean +# Otherwise skip configure and let ninja/make handle incremental builds. +# Ninja will auto re-configure if CMakeLists.txt files changed. +if [[ ! -f "${CMAKE_BUILD_DIR}/CMakeCache.txt" ]]; then + echo "-- Running cmake configure (first time or after clean) ..." 
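+    # MAKE_TEST=OFF skips unit-test targets; BUILD_BENCHMARK=ON enables the
+    # benchmark_test target that is built right after this configure step.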
+ "${CMAKE_CMD}" -G "${GENERATOR}" \ + -DCMAKE_MAKE_PROGRAM="${MAKE_PROGRAM}" \ + -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \ + -DMAKE_TEST=OFF \ + -DBUILD_BENCHMARK=ON \ + -DGLIBC_COMPATIBILITY="${GLIBC_COMPATIBILITY}" \ + -DUSE_LIBCPP="${USE_LIBCPP}" \ + -DBUILD_META_TOOL=OFF \ + -DBUILD_FILE_CACHE_MICROBENCH_TOOL=OFF \ + -DUSE_UNWIND="${USE_UNWIND}" \ + -DUSE_JEMALLOC="${USE_JEMALLOC}" \ + -DUSE_AVX2="${USE_AVX2}" \ + -DARM_MARCH="${ARM_MARCH}" \ + -DEXTRA_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -DENABLE_CLANG_COVERAGE=OFF \ + -DENABLE_INJECTION_POINT=OFF \ + ${CMAKE_USE_CCACHE:+${CMAKE_USE_CCACHE}} \ + -DENABLE_PCH="${ENABLE_PCH}" \ + -DDORIS_JAVA_HOME="${JAVA_HOME}" \ + -DBUILD_AZURE="${BUILD_AZURE}" \ + "${DORIS_HOME}/be" +else + echo "-- Skipping cmake configure (CMakeCache.txt exists, use --clean to force reconfigure)" +fi + +"${BUILD_SYSTEM}" -j "${PARALLEL}" benchmark_test + +if [[ "${RUN}" -ne 1 ]]; then + echo "Build finished. Binary: ${CMAKE_BUILD_DIR}/bin/benchmark_test" + echo "To run: $0 --run [--filter=]" + exit 0 +fi + +echo "***********************************" +echo " Running Backend Benchmark " +echo "***********************************" + +cd "${DORIS_HOME}" + +# Setup Java env for JNI dependencies +jdk_version() { + local java_cmd="${1}" + local result + local IFS=$'\n' + if [[ -z "${java_cmd}" ]]; then + result=no_java + return 1 + else + local version + version="$("${java_cmd}" -Xms32M -Xmx32M -version 2>&1 | tr '\r' '\n' | grep version | awk '{print $3}')" + version="${version//\"/}" + if [[ "${version}" =~ ^1\. ]]; then + result="$(echo "${version}" | awk -F '.' '{print $2}')" + else + result="$(echo "${version}" | awk -F '.' '{print $1}')" + fi + fi + echo "${result}" + return 0 +} + +setup_java_env() { + echo "JAVA_HOME: ${JAVA_HOME}" + if [[ -z "${JAVA_HOME}" ]]; then + return 1 + fi + + local jvm_arch='amd64' + if [[ "$(uname -m)" == 'aarch64' ]]; then + jvm_arch='aarch64' + fi + local java_version + java_version="$( + set -e + jdk_version "${JAVA_HOME}/bin/java" + )" + if [[ "${java_version}" -gt 8 ]]; then + export LD_LIBRARY_PATH="${JAVA_HOME}/lib/server:${JAVA_HOME}/lib:${LD_LIBRARY_PATH}" + elif [[ -d "${JAVA_HOME}/jre" ]]; then + export LD_LIBRARY_PATH="${JAVA_HOME}/jre/lib/${jvm_arch}/server:${JAVA_HOME}/jre/lib/${jvm_arch}:${LD_LIBRARY_PATH}" + else + export LD_LIBRARY_PATH="${JAVA_HOME}/lib/${jvm_arch}/server:${JAVA_HOME}/lib/${jvm_arch}:${LD_LIBRARY_PATH}" + fi + + if [[ "$(uname -s)" == 'Darwin' ]]; then + export DYLD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${DYLD_LIBRARY_PATH}" + fi +} + +setup_java_env || true + +# Prepare minimal runtime dirs +BENCHMARK_BINARY="${CMAKE_BUILD_DIR}/bin/benchmark_test" + +CONF_DIR="${CMAKE_BUILD_DIR}/conf" +mkdir -p "${CONF_DIR}" +cp -f "${DORIS_HOME}/conf/be.conf" "${CONF_DIR}/" + +LOG_DIR="${CMAKE_BUILD_DIR}/log" +mkdir -p "${LOG_DIR}" + +export DORIS_HOME="${CMAKE_BUILD_DIR}" +export TERM="xterm" + +# Prepare java classpath +LIB_DIR="${CMAKE_BUILD_DIR}/lib/" +mkdir -p "${LIB_DIR}" +if [[ -d "${DORIS_THIRDPARTY}/installed/lib/hadoop_hdfs/" ]]; then + cp -r "${DORIS_THIRDPARTY}/installed/lib/hadoop_hdfs/" "${LIB_DIR}" 2>/dev/null || true +fi + +DORIS_CLASSPATH="" +for f in "${LIB_DIR}"/*.jar; do + [[ -f "${f}" ]] || continue + if [[ -z "${DORIS_CLASSPATH}" ]]; then + DORIS_CLASSPATH="${f}" + else + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + fi +done +if [[ -d "${LIB_DIR}/hadoop_hdfs/" ]]; then + for f in "${LIB_DIR}/hadoop_hdfs/common"/*.jar; do + [[ -f "${f}" ]] || continue + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done 
+ for f in "${LIB_DIR}/hadoop_hdfs/common/lib"/*.jar; do + [[ -f "${f}" ]] || continue + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done + for f in "${LIB_DIR}/hadoop_hdfs/hdfs"/*.jar; do + [[ -f "${f}" ]] || continue + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done + for f in "${LIB_DIR}/hadoop_hdfs/hdfs/lib"/*.jar; do + [[ -f "${f}" ]] || continue + DORIS_CLASSPATH="${f}:${DORIS_CLASSPATH}" + done +fi +export CLASSPATH="${DORIS_CLASSPATH}" +export DORIS_CLASSPATH="-Djava.class.path=${DORIS_CLASSPATH}" + +CUR_DATE=$(date +%Y%m%d-%H%M%S) +export JAVA_OPTS="-Xmx1024m -DlogPath=${LOG_DIR}/jni.log -Xloggc:${LOG_DIR}/be.gc.log.${CUR_DATE} -Dsun.java.command=DorisBEBenchmark -XX:-CriticalJNINatives -DJDBC_MIN_POOL=1 -DJDBC_MAX_POOL=100 -DJDBC_MAX_IDLE_TIME=300000" +export LIBHDFS_OPTS="${JAVA_OPTS}" + +# Run the benchmark +if [[ ! -f "${BENCHMARK_BINARY}" ]]; then + echo "Error: benchmark binary not found: ${BENCHMARK_BINARY}" + exit 1 +fi + +BENCHMARK_ARGS=() +if [[ -n "${FILTER}" ]]; then + BENCHMARK_ARGS+=("--benchmark_filter=${FILTER}") +fi + +echo "Running: ${BENCHMARK_BINARY} ${BENCHMARK_ARGS[*]}" +"${BENCHMARK_BINARY}" "${BENCHMARK_ARGS[@]}" + +echo "=== Benchmark finished ==="