vortex-data · a10y · Feb 9, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -36,6 +36,7 @@ members = [
     "vortex-tui",
     "vortex-test/e2e",
     "vortex-test/e2e-cuda",
+    "vortex-test/e2e-cuda-scan",
     "xtask",
     # Encodings
     "encodings/fastlanes",

diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -88,6 +88,11 @@ impl VTable for PrimitiveVTable {
 
         let ptype = PType::try_from(dtype)?;
 
+        vortex_ensure!(
+            buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
+            "Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
+        );
+
         if buffer.len() != ptype.byte_width() * len {
             vortex_bail!(
                 "Buffer length {} does not match expected length {} for {}, {}",

diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs
@@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
 }
 
 impl BtrBlocksCompressorBuilder {
+    /// Create a new builder with no encodings enabled.
+    pub fn empty() -> Self {
+        Self {
+            int_schemes: Default::default(),
+            float_schemes: Default::default(),
+            string_schemes: Default::default(),
+        }
+    }
+
     /// Excludes the specified integer compression schemes.
     pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
         let codes: HashSet<_> = codes.into_iter().collect();

diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -25,11 +25,13 @@ arc-swap = { workspace = true }
 arrow-data = { workspace = true, features = ["ffi"] }
 arrow-schema = { workspace = true, features = ["ffi"] }
 async-trait = { workspace = true }
+bytes = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
 fastlanes = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
 paste = { workspace = true }
+tokio = { workspace = true, features = ["fs"] }
 tracing = { workspace = true }
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }

diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs
@@ -27,7 +27,9 @@ use vortex_cuda::CudaSession;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 use vortex_dtype::NativePType;
+use vortex_dtype::PType;
 use vortex_error::VortexExpect;
+use vortex_fastlanes::BitPackedArray;
 use vortex_fastlanes::FoRArray;
 use vortex_scalar::Scalar;
 use vortex_session::VortexSession;
@@ -36,20 +38,28 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
 const REFERENCE_VALUE: u8 = 10;
 
 /// Creates a FoR array with the specified type and length.
-fn make_for_array_typed<T>(len: usize) -> FoRArray
+fn make_for_array_typed<T>(len: usize, bp: bool) -> FoRArray
 where
     T: NativePType + From<u8> + Add<Output = T>,
     Scalar: From<T>,
 {
     let reference = <T as From<u8>>::from(REFERENCE_VALUE);
     let data: Vec<T> = (0..len)
-        .map(|i| <T as From<u8>>::from((i % 256) as u8) + reference)
+        .map(|i| <T as From<u8>>::from((i % 256) as u8))
         .collect();
 
     let primitive_array =
         PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array();
 
-    FoRArray::try_new(primitive_array, reference.into()).vortex_expect("failed to create FoR array")
+    if bp && T::PTYPE != PType::U8 {
+        let child =
+            BitPackedArray::encode(primitive_array.as_ref(), 8).vortex_expect("failed to bitpack");
+        FoRArray::try_new(child.into_array(), reference.into())
+            .vortex_expect("failed to create FoR array")
+    } else {
+        FoRArray::try_new(primitive_array, reference.into())
+            .vortex_expect("failed to create FoR array")
+    }
 }
 
 /// Launches FoR decompression kernel and returns elapsed GPU time.
@@ -95,10 +105,49 @@ where
     let mut group = c.benchmark_group("for_cuda");
     group.sample_size(10);
 
-    for (len, len_str) in BENCH_ARGS {
+    for &(len, len_str) in BENCH_ARGS {
+        group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
+
+        let for_array = make_for_array_typed::<T>(len, false);
+
+        group.bench_with_input(
+            BenchmarkId::new("for", format!("{len_str}_{type_name}")),
+            &for_array,
+            |b, for_array| {
+                b.iter_custom(|iters| {
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                        .vortex_expect("failed to create execution context");
+
+                    let mut total_time = Duration::ZERO;
+
+                    for _ in 0..iters {
+                        let kernel_time =
+                            launch_for_kernel_timed_typed::<T>(for_array, &mut cuda_ctx)
+                                .vortex_expect("kernel launch failed");
+                        total_time += kernel_time;
+                    }
+
+                    total_time
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn benchmark_ffor_typed<T>(c: &mut Criterion, type_name: &str)
+where
+    T: NativePType + DeviceRepr + From<u8> + Add<Output = T>,
+    Scalar: From<T>,
+{
+    let mut group = c.benchmark_group("ffor_cuda");
+    group.sample_size(10);
+
+    for &(len, len_str) in BENCH_ARGS {
         group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
 
-        let for_array = make_for_array_typed::<T>(*len);
+        let for_array = make_for_array_typed::<T>(len, true);
 
         group.bench_with_input(
             BenchmarkId::new("for", format!("{len_str}_{type_name}")),
@@ -134,7 +183,15 @@ fn benchmark_for(c: &mut Criterion) {
     benchmark_for_typed::<u64>(c, "u64");
 }
 
-criterion::criterion_group!(benches, benchmark_for);
+/// Benchmark FOR+BP decompression for all types.
+fn benchmark_ffor(c: &mut Criterion) {
+    benchmark_ffor_typed::<u8>(c, "u8");
+    benchmark_ffor_typed::<u16>(c, "u16");
+    benchmark_ffor_typed::<u32>(c, "u32");
+    benchmark_ffor_typed::<u64>(c, "u64");
+}
+
+criterion::criterion_group!(benches, benchmark_for, benchmark_ffor);
 
 #[cuda_available]
 criterion::criterion_main!(benches);

diff --git a/vortex-cuda/cuda_kernel_generator/mod.rs b/vortex-cuda/cuda_kernel_generator/mod.rs
@@ -20,23 +20,22 @@ fn generate_lane_decoder<T: FastLanes, W: Write>(
 
     writeln!(
         output,
-        "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
+        "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, const uint{bits}_t reference, unsigned int lane) {{"
     )?;
 
     output.indent(|output| {
         writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
         if bit_width == 0 {
-            writeln!(output, "uint{bits}_t zero = 0ULL;")?;
             writeln!(output)?;
             for row in 0..bits {
-                writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
+                writeln!(output, "out[INDEX({row}, lane)] = reference;")?;
             }
         } else if bit_width == bits {
             writeln!(output)?;
             for row in 0..bits {
                 writeln!(
                     output,
-                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane] + reference;",
                 )?;
             }
         } else {
@@ -72,7 +71,7 @@ fn generate_lane_decoder<T: FastLanes, W: Write>(
                     )?;
                 }
 
-                writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
+                writeln!(output, "out[INDEX({row}, lane)] = tmp + reference;")?;
             }
         }
         Ok(())
@@ -93,14 +92,14 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
     let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
 
     let local_func_params = format!(
-        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
+        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx)"
     );
 
     writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
 
     output.indent(|output| {
         for thread_lane in 0..per_thread_loop_count {
-            writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
+            writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
         }
         Ok(())
     })?;
@@ -116,8 +115,9 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
     let bits = <T>::T;
 
     let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
-    let func_params =
-        format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");
+    let func_params = format!(
+        "(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference)"
+    );
 
     writeln!(
         output,
@@ -132,7 +132,7 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
         )?;
         writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;
 
-        writeln!(output, "_{func_name}(in, out, thread_idx);")
+        writeln!(output, "_{func_name}(in, out, reference, thread_idx);")
     })?;
 
     writeln!(output, "}}")