Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 138 additions & 3 deletions datafusion/common/benches/with_hashes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@
use ahash::RandomState;
use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray,
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, StringViewArray, make_array,
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, RunArray, StringArray,
StringViewArray, StructArray, make_array,
};
use arrow::buffer::NullBuffer;
use arrow::datatypes::{ArrowDictionaryKeyType, Int32Type, Int64Type};
use arrow::datatypes::{
ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type,
};
use criterion::{Bencher, Criterion, criterion_group, criterion_main};
use datafusion_common::hash_utils::with_hashes;
use rand::Rng;
Expand Down Expand Up @@ -68,11 +71,25 @@ fn criterion_benchmark(c: &mut Criterion) {
name: "dictionary_utf8_int32",
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
},
BenchData {
name: "struct_array",
array: create_struct_array(BATCH_SIZE),
},
BenchData {
name: "run_array_int32",
array: create_run_array::<Int32Type>(BATCH_SIZE),
},
];

for BenchData { name, array } in cases {
// with_hash has different code paths for single vs multiple arrays and nulls vs no nulls
let nullable_array = add_nulls(&array);
// RunArray encodes nulls in the values array, not at the array level
let nullable_array = if name.starts_with("run_array") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should take the approach by #20179 to have this as an explicit property instead of checking by name

create_run_array_with_null_values::<Int32Type>(BATCH_SIZE)
} else {
add_nulls(&array)
};

c.bench_function(&format!("{name}: single, no nulls"), |b| {
do_hash_test(b, std::slice::from_ref(&array));
});
Expand Down Expand Up @@ -205,5 +222,123 @@ where
Arc::new(array)
}

/// Create a StructArray with multiple columns
fn create_struct_array(array_len: usize) -> ArrayRef {
let mut rng = make_rng();

// Create 4 columns of different types for our struct array
let bool_array: ArrayRef = Arc::new(
(0..array_len)
.map(|_| Some(rng.random::<bool>()))
.collect::<arrow::array::BooleanArray>(),
);

let int32_array: ArrayRef = Arc::new(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we could reuse the existing functions above to create these random arrays? e.g. primitive_array()

The only difference I see is that it uses its own rng; is this a big concern, considering each of these functions currently use their own rng anyway?

(0..array_len)
.map(|_| Some(rng.random::<i32>()))
.collect::<PrimitiveArray<Int32Type>>(),
);

let int64_array: ArrayRef = Arc::new(
(0..array_len)
.map(|_| Some(rng.random::<i64>()))
.collect::<PrimitiveArray<Int64Type>>(),
);

let string_array: ArrayRef = {
let strings: Vec<String> = (0..array_len)
.map(|_| {
let len = rng.random_range(5..20);
let value: Vec<u8> =
rng.clone().sample_iter(&Alphanumeric).take(len).collect();
String::from_utf8(value).unwrap()
})
.collect();
Arc::new(StringArray::from(strings))
};

let fields = Fields::from(vec![
Field::new("bool_col", DataType::Boolean, false),
Field::new("int32_col", DataType::Int32, false),
Field::new("int64_col", DataType::Int64, false),
Field::new("string_col", DataType::Utf8, false),
]);

Arc::new(StructArray::new(
fields,
vec![bool_array, int32_array, int64_array, string_array],
None,
))
}

/// Create a RunArray to test run array hashing.
fn create_run_array<T>(array_len: usize) -> ArrayRef
where
T: ArrowPrimitiveType,
StandardUniform: Distribution<T::Native>,
{
let mut rng = make_rng();

// Create runs of varying lengths
let mut run_ends = Vec::new();
let mut values = Vec::new();
let mut current_end = 0;

while current_end < array_len {
// Random run length between 1 and 50
let run_length = rng.random_range(1..=50).min(array_len - current_end);
current_end += run_length;
run_ends.push(current_end as i32);
values.push(Some(rng.random::<T::Native>()));
}

let run_ends_array = Arc::new(PrimitiveArray::<Int32Type>::from(run_ends));
let values_array: Arc<dyn Array> =
Arc::new(values.into_iter().collect::<PrimitiveArray<T>>());

Arc::new(
RunArray::try_new(&run_ends_array, values_array.as_ref())
.expect("Failed to create RunArray"),
)
}

/// Create a RunArray with null values
fn create_run_array_with_null_values<T>(array_len: usize) -> ArrayRef
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel we can reduce the duplication with above function here if the only difference is nulls 🤔

It would just be a matter of calling add_nulls() on the values array

where
T: ArrowPrimitiveType,
StandardUniform: Distribution<T::Native>,
{
let mut rng = make_rng();
let null_density = 0.03; // Same as create_null_mask()

// Create runs of varying lengths
let mut run_ends = Vec::new();
let mut values = Vec::new();
let mut current_end = 0;

while current_end < array_len {
// Random run length between 1 and 50
let run_length = rng.random_range(1..=50).min(array_len - current_end);
current_end += run_length;
run_ends.push(current_end as i32);

// Randomly make some values null based on null_density
if rng.random::<f32>() < null_density {
values.push(None);
} else {
values.push(Some(rng.random::<T::Native>()));
}
}

let run_ends_array = Arc::new(PrimitiveArray::<Int32Type>::from(run_ends));
let values_array: Arc<dyn Array> =
Arc::new(values.into_iter().collect::<PrimitiveArray<T>>());

Arc::new(
RunArray::try_new(&run_ends_array, values_array.as_ref())
.expect("Failed to create RunArray with null values"),
)
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);