-
Notifications
You must be signed in to change notification settings - Fork 131
add GHArchive random access benchmark #6372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -6,22 +6,39 @@ use std::time::Duration; | |||||||||
| use std::time::Instant; | ||||||||||
|
|
||||||||||
| use clap::Parser; | ||||||||||
| use clap::ValueEnum; | ||||||||||
| use indicatif::ProgressBar; | ||||||||||
| use vortex_bench::BenchmarkOutput; | ||||||||||
| use vortex_bench::Engine; | ||||||||||
| use vortex_bench::Format; | ||||||||||
| use vortex_bench::Target; | ||||||||||
| use vortex_bench::create_output_writer; | ||||||||||
| use vortex_bench::datasets::gharchive::gharchive_parquet; | ||||||||||
| use vortex_bench::datasets::gharchive::gharchive_vortex; | ||||||||||
| use vortex_bench::datasets::taxi_data::*; | ||||||||||
| use vortex_bench::display::DisplayFormat; | ||||||||||
| use vortex_bench::display::print_measurements_json; | ||||||||||
| use vortex_bench::display::render_table; | ||||||||||
| use vortex_bench::measurements::TimingMeasurement; | ||||||||||
| use vortex_bench::random_access::FieldPath; | ||||||||||
| use vortex_bench::random_access::ParquetProjectingAccessor; | ||||||||||
| use vortex_bench::random_access::ParquetRandomAccessor; | ||||||||||
| use vortex_bench::random_access::ProjectingRandomAccessor; | ||||||||||
| use vortex_bench::random_access::RandomAccessor; | ||||||||||
| use vortex_bench::random_access::VortexProjectingAccessor; | ||||||||||
| use vortex_bench::random_access::VortexRandomAccessor; | ||||||||||
| use vortex_bench::setup_logging_and_tracing; | ||||||||||
| use vortex_bench::utils::constants::STORAGE_NVME; | ||||||||||
|
|
||||||||||
| /// Available datasets for random access benchmarks. | ||||||||||
| #[derive(Clone, Copy, Debug, Default, ValueEnum)] | ||||||||||
| enum Dataset { | ||||||||||
| /// NYC Taxi trip data - flat schema with many columns. | ||||||||||
| #[default] | ||||||||||
| Taxi, | ||||||||||
| /// GitHub Archive event data - deeply nested schema with struct fields. | ||||||||||
| GhArchive, | ||||||||||
| } | ||||||||||
|
|
||||||||||
| #[derive(Parser, Debug)] | ||||||||||
| #[command(version, about, long_about = None)] | ||||||||||
| struct Args { | ||||||||||
|
|
@@ -32,6 +49,9 @@ struct Args { | |||||||||
| default_values_t = vec![Format::Parquet, Format::OnDiskVortex] | ||||||||||
| )] | ||||||||||
| formats: Vec<Format>, | ||||||||||
| /// Dataset to benchmark. | ||||||||||
| #[arg(long, value_enum, default_value_t = Dataset::Taxi)] | ||||||||||
| dataset: Dataset, | ||||||||||
| /// Time limit in seconds for each benchmark target (e.g., 10 for 10 seconds). | ||||||||||
| #[arg(long, default_value_t = 10)] | ||||||||||
| time_limit: u64, | ||||||||||
|
|
@@ -51,21 +71,40 @@ async fn main() -> anyhow::Result<()> { | |||||||||
|
|
||||||||||
| setup_logging_and_tracing(args.verbose, args.tracing)?; | ||||||||||
|
|
||||||||||
| // Row count of the dataset is 3,339,715. | ||||||||||
| let indices = vec![10u64, 11, 12, 13, 100_000, 3_000_000]; | ||||||||||
|
|
||||||||||
| run_random_access( | ||||||||||
| args.formats, | ||||||||||
| args.time_limit, | ||||||||||
| args.display_format, | ||||||||||
| indices, | ||||||||||
| args.output_path, | ||||||||||
| ) | ||||||||||
| .await | ||||||||||
| match args.dataset { | ||||||||||
| Dataset::Taxi => { | ||||||||||
| // Row count of the taxi dataset is 3,339,715. | ||||||||||
| let indices = vec![10u64, 11, 12, 13, 100_000, 3_000_000]; | ||||||||||
| run_taxi_random_access( | ||||||||||
| args.formats, | ||||||||||
| args.time_limit, | ||||||||||
| args.display_format, | ||||||||||
| indices, | ||||||||||
| args.output_path, | ||||||||||
| ) | ||||||||||
| .await | ||||||||||
| } | ||||||||||
| Dataset::GhArchive => { | ||||||||||
| // Run gharchive benchmark with nested field projection. | ||||||||||
| // The field path is actor.login - a nested string field. | ||||||||||
| let field_path = vec!["actor".to_string(), "login".to_string()]; | ||||||||||
| // Use smaller indices as gharchive may have fewer rows per row group. | ||||||||||
| let indices = vec![10u64, 11, 12, 13, 1_000, 10_000]; | ||||||||||
| run_gharchive_random_access( | ||||||||||
| args.formats, | ||||||||||
| args.time_limit, | ||||||||||
| args.display_format, | ||||||||||
| indices, | ||||||||||
| field_path, | ||||||||||
| args.output_path, | ||||||||||
| ) | ||||||||||
| .await | ||||||||||
| } | ||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /// Create a random accessor for the given format using taxi data. | ||||||||||
| async fn get_accessor(format: Format) -> anyhow::Result<Box<dyn RandomAccessor>> { | ||||||||||
| async fn get_taxi_accessor(format: Format) -> anyhow::Result<Box<dyn RandomAccessor>> { | ||||||||||
| match format { | ||||||||||
| Format::OnDiskVortex => { | ||||||||||
| let path = taxi_data_vortex().await?; | ||||||||||
|
|
@@ -91,6 +130,28 @@ async fn get_accessor(format: Format) -> anyhow::Result<Box<dyn RandomAccessor>> | |||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /// Create a projecting random accessor for the given format using gharchive data. | ||||||||||
| async fn get_gharchive_accessor( | ||||||||||
| format: Format, | ||||||||||
| ) -> anyhow::Result<Box<dyn ProjectingRandomAccessor>> { | ||||||||||
| match format { | ||||||||||
| Format::OnDiskVortex => { | ||||||||||
| let path = gharchive_vortex().await?; | ||||||||||
| Ok(Box::new(VortexProjectingAccessor::new(path))) | ||||||||||
| } | ||||||||||
| Format::VortexCompact => { | ||||||||||
| // For now, use the same path as OnDiskVortex (compact not yet implemented for gharchive) | ||||||||||
| let path = gharchive_vortex().await?; | ||||||||||
| Ok(Box::new(VortexProjectingAccessor::compact(path))) | ||||||||||
|
Comment on lines
+143
to
+145
|
||||||||||
| // For now, use the same path as OnDiskVortex (compact not yet implemented for gharchive) | |
| let path = gharchive_vortex().await?; | |
| Ok(Box::new(VortexProjectingAccessor::compact(path))) | |
| unimplemented!("Compact gharchive dataset path is not yet implemented"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment says the field path is `payload.ref`, but the code actually benchmarks `actor.login`. Please update the comment (or change the field path) so benchmark output/intent is clear and doesn’t drift from the implementation.