Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ members = [
"vortex-tui",
"vortex-test/e2e",
"vortex-test/e2e-cuda",
"vortex-test/e2e-cuda-scan",
"xtask",
# Encodings
"encodings/fastlanes",
Expand Down
5 changes: 5 additions & 0 deletions vortex-array/src/arrays/primitive/vtable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ impl VTable for PrimitiveVTable {

let ptype = PType::try_from(dtype)?;

vortex_ensure!(
buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
"Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
);

if buffer.len() != ptype.byte_width() * len {
vortex_bail!(
"Buffer length {} does not match expected length {} for {}, {}",
Expand Down
9 changes: 9 additions & 0 deletions vortex-btrblocks/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
}

impl BtrBlocksCompressorBuilder {
/// Create a new builder with no encodings enabled.
pub fn empty() -> Self {
Self {
int_schemes: Default::default(),
float_schemes: Default::default(),
string_schemes: Default::default(),
}
}

/// Excludes the specified integer compression schemes.
pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
let codes: HashSet<_> = codes.into_iter().collect();
Expand Down
2 changes: 2 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ arc-swap = { workspace = true }
arrow-data = { workspace = true, features = ["ffi"] }
arrow-schema = { workspace = true, features = ["ffi"] }
async-trait = { workspace = true }
bytes = { workspace = true }
cudarc = { workspace = true, features = ["f16"] }
fastlanes = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
paste = { workspace = true }
tokio = { workspace = true, features = ["fs"] }
tracing = { workspace = true }
vortex-alp = { workspace = true }
vortex-array = { workspace = true }
Expand Down
69 changes: 63 additions & 6 deletions vortex-cuda/benches/for_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ use vortex_cuda::CudaSession;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_dtype::NativePType;
use vortex_dtype::PType;
use vortex_error::VortexExpect;
use vortex_fastlanes::BitPackedArray;
use vortex_fastlanes::FoRArray;
use vortex_scalar::Scalar;
use vortex_session::VortexSession;
Expand All @@ -36,20 +38,28 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
const REFERENCE_VALUE: u8 = 10;

/// Creates a FoR array with the specified type and length.
fn make_for_array_typed<T>(len: usize) -> FoRArray
fn make_for_array_typed<T>(len: usize, bp: bool) -> FoRArray
where
T: NativePType + From<u8> + Add<Output = T>,
Scalar: From<T>,
{
let reference = <T as From<u8>>::from(REFERENCE_VALUE);
let data: Vec<T> = (0..len)
.map(|i| <T as From<u8>>::from((i % 256) as u8) + reference)
.map(|i| <T as From<u8>>::from((i % 256) as u8))
.collect();

let primitive_array =
PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array();

FoRArray::try_new(primitive_array, reference.into()).vortex_expect("failed to create FoR array")
if bp && T::PTYPE != PType::U8 {
let child =
BitPackedArray::encode(primitive_array.as_ref(), 8).vortex_expect("failed to bitpack");
FoRArray::try_new(child.into_array(), reference.into())
.vortex_expect("failed to create FoR array")
} else {
FoRArray::try_new(primitive_array, reference.into())
.vortex_expect("failed to create FoR array")
}
}

/// Launches FoR decompression kernel and returns elapsed GPU time.
Expand Down Expand Up @@ -95,10 +105,49 @@ where
let mut group = c.benchmark_group("for_cuda");
group.sample_size(10);

for (len, len_str) in BENCH_ARGS {
for &(len, len_str) in BENCH_ARGS {
group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));

let for_array = make_for_array_typed::<T>(len, false);

group.bench_with_input(
BenchmarkId::new("for", format!("{len_str}_{type_name}")),
&for_array,
|b, for_array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");

let mut total_time = Duration::ZERO;

for _ in 0..iters {
let kernel_time =
launch_for_kernel_timed_typed::<T>(for_array, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
}

total_time
});
},
);
}

group.finish();
}

fn benchmark_ffor_typed<T>(c: &mut Criterion, type_name: &str)
where
T: NativePType + DeviceRepr + From<u8> + Add<Output = T>,
Scalar: From<T>,
{
let mut group = c.benchmark_group("ffor_cuda");
group.sample_size(10);

for &(len, len_str) in BENCH_ARGS {
group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));

let for_array = make_for_array_typed::<T>(*len);
let for_array = make_for_array_typed::<T>(len, true);

group.bench_with_input(
BenchmarkId::new("for", format!("{len_str}_{type_name}")),
Expand Down Expand Up @@ -134,7 +183,15 @@ fn benchmark_for(c: &mut Criterion) {
benchmark_for_typed::<u64>(c, "u64");
}

criterion::criterion_group!(benches, benchmark_for);
/// Benchmark FOR+BP decompression for all types.
fn benchmark_ffor(c: &mut Criterion) {
benchmark_ffor_typed::<u8>(c, "u8");
benchmark_ffor_typed::<u16>(c, "u16");
benchmark_ffor_typed::<u32>(c, "u32");
benchmark_ffor_typed::<u64>(c, "u64");
}

criterion::criterion_group!(benches, benchmark_for, benchmark_ffor);

#[cuda_available]
criterion::criterion_main!(benches);
Expand Down
20 changes: 10 additions & 10 deletions vortex-cuda/cuda_kernel_generator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,22 @@ fn generate_lane_decoder<T: FastLanes, W: Write>(

writeln!(
output,
"__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
"__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, const uint{bits}_t reference, unsigned int lane) {{"
)?;

output.indent(|output| {
writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
if bit_width == 0 {
writeln!(output, "uint{bits}_t zero = 0ULL;")?;
writeln!(output)?;
for row in 0..bits {
writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
writeln!(output, "out[INDEX({row}, lane)] = reference;")?;
}
} else if bit_width == bits {
writeln!(output)?;
for row in 0..bits {
writeln!(
output,
"out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
"out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane] + reference;",
)?;
}
} else {
Expand Down Expand Up @@ -72,7 +71,7 @@ fn generate_lane_decoder<T: FastLanes, W: Write>(
)?;
}

writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
writeln!(output, "out[INDEX({row}, lane)] = tmp + reference;")?;
}
}
Ok(())
Expand All @@ -93,14 +92,14 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");

let local_func_params = format!(
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx)"
);

writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;

output.indent(|output| {
for thread_lane in 0..per_thread_loop_count {
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
}
Ok(())
})?;
Expand All @@ -116,8 +115,9 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
let bits = <T>::T;

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
let func_params =
format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");
let func_params = format!(
"(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference)"
);

writeln!(
output,
Expand All @@ -132,7 +132,7 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
)?;
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;

writeln!(output, "_{func_name}(in, out, thread_idx);")
writeln!(output, "_{func_name}(in, out, reference, thread_idx);")
})?;

writeln!(output, "}}")
Expand Down
Loading