Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions vortex-btrblocks/src/compressor/integer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ impl Hash for dyn IntegerScheme {
}

/// Unique identifier for integer compression schemes.
///
/// NOTE: Variant order matters for tie-breaking; `For` must precede `BitPacking` to avoid unnecessary patches.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)]
pub enum IntCode {
/// No compression applied.
Expand All @@ -142,10 +144,10 @@ pub enum IntCode {
Constant,
/// Frame of Reference encoding - subtracts minimum value then bitpacks.
For,
/// ZigZag encoding - transforms negative integers to positive for better bitpacking.
ZigZag,
/// BitPacking encoding - compresses non-negative integers by reducing bit width.
BitPacking,
/// ZigZag encoding - transforms negative integers to positive for better bitpacking.
ZigZag,
/// Sparse encoding - optimizes null-dominated or single-value-dominated arrays.
Sparse,
/// Dictionary encoding - creates a dictionary of unique values.
Expand Down Expand Up @@ -342,19 +344,28 @@ impl Scheme for FORScheme {
.bit_width()
.try_into()
.vortex_expect("bit width must fit in u32");
let bw = match stats.typed.max_minus_min().checked_ilog2() {
let for_bw = match stats.typed.max_minus_min().checked_ilog2() {
Some(l) => l + 1,
// If max-min == 0, we should use a different compression scheme
// as we don't want to bitpack down to 0 bits.
None => return Ok(0.0),
};

// If we're not saving at least 1 byte, don't bother with FOR
if full_width - bw < 8 {
return Ok(0.0);
// If BitPacking could apply (non-negative values) and FOR doesn't reduce bit width
// compared to BitPacking, don't use FOR since it has overhead (storing reference).
// Only skip FOR when min >= 0, otherwise BitPacking can't apply directly.
if let Some(max_log) = stats
.typed
.max_ilog2()
.filter(|_| !stats.typed.min_is_negative())
{
let bitpack_bw = max_log + 1;
if for_bw >= bitpack_bw {
return Ok(0.0);
}
}

Ok(full_width as f64 / bw as f64)
Ok(full_width as f64 / for_bw as f64)
}

fn compress(
Expand Down
22 changes: 22 additions & 0 deletions vortex-btrblocks/src/compressor/integer/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,28 @@ impl ErasedStats {
}
}

/// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero.
///
/// This matches how BitPacking computes bit width: it reinterprets signed values as
/// unsigned (preserving bit pattern) and uses leading_zeros. For non-negative signed
/// values, the transmuted value equals the original value.
///
/// This is used to determine if FOR encoding would reduce bit width compared to
/// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
pub fn max_ilog2(&self) -> Option<u32> {
match &self {
ErasedStats::U8(x) => x.max.checked_ilog2(),
ErasedStats::U16(x) => x.max.checked_ilog2(),
ErasedStats::U32(x) => x.max.checked_ilog2(),
ErasedStats::U64(x) => x.max.checked_ilog2(),
// Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior
ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
}
}

/// Get the most commonly occurring value and its count
pub fn top_value_and_count(&self) -> (PValue, u32) {
match &self {
Expand Down
2 changes: 1 addition & 1 deletion vortex-python/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ impl PyVortexWriteOptions {
/// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex")
/// >>> import os
/// >>> os.path.getsize('chonky.vortex')
/// 216156
/// 216020
/// ```
///
/// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny.
Expand Down
2 changes: 1 addition & 1 deletion vortex-test/e2e/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ mod tests {
PrimitiveArray::new(Buffer::from_iter(values), Validity::NonNullable).into_array();

// Write in parallel and verify all sizes match expected
const EXPECTED_SIZE: usize = 216156;
const EXPECTED_SIZE: usize = 216020;
let futures: Vec<_> = (0..5)
.map(|_| {
let array = array.clone();
Expand Down
Loading