diff --git a/Cargo.lock b/Cargo.lock
index 578056b75b5..3e2d5340837 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10375,6 +10375,7 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "async-trait",
+ "bytes",
  "codspeed-criterion-compat-walltime",
  "cudarc",
  "fastlanes",
@@ -11064,6 +11065,19 @@ dependencies = [
  "vortex-cuda",
 ]
 
+[[package]]
+name = "vortex-test-e2e-cuda-scan"
+version = "0.1.0"
+dependencies = [
+ "arrow-array",
+ "arrow-schema",
+ "futures",
+ "tokio",
+ "tracing-subscriber",
+ "vortex",
+ "vortex-cuda",
+]
+
 [[package]]
 name = "vortex-tui"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index ba9a0268b87..5e6848072e9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,6 +36,7 @@ members = [
     "vortex-tui",
     "vortex-test/e2e",
     "vortex-test/e2e-cuda",
+    "vortex-test/e2e-cuda-scan",
     "xtask",
     # Encodings
     "encodings/fastlanes",
diff --git a/vortex-array/src/arrays/primitive/vtable/mod.rs b/vortex-array/src/arrays/primitive/vtable/mod.rs
index 270c08de8af..753a62cb97a 100644
--- a/vortex-array/src/arrays/primitive/vtable/mod.rs
+++ b/vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -88,6 +88,11 @@ impl VTable for PrimitiveVTable {
 
         let ptype = PType::try_from(dtype)?;
 
+        vortex_ensure!(
+            buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
+            "Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
+        );
+
         if buffer.len() != ptype.byte_width() * len {
             vortex_bail!(
                 "Buffer length {} does not match expected length {} for {}, {}",
diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs
index 1ba5fd57f09..d5f42c2246d 100644
--- a/vortex-btrblocks/src/builder.rs
+++ b/vortex-btrblocks/src/builder.rs
@@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
 }
 
 impl BtrBlocksCompressorBuilder {
+    /// Create a new builder with no encodings enabled.
+    pub fn empty() -> Self {
+        Self {
+            int_schemes: Default::default(),
+            float_schemes: Default::default(),
+            string_schemes: Default::default(),
+        }
+    }
+
     /// Excludes the specified integer compression schemes.
     pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self {
         let codes: HashSet<_> = codes.into_iter().collect();
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
index dad4adb76c1..1c5052909f4 100644
--- a/vortex-cuda/Cargo.toml
+++ b/vortex-cuda/Cargo.toml
@@ -25,11 +25,13 @@ arc-swap = { workspace = true }
 arrow-data = { workspace = true, features = ["ffi"] }
 arrow-schema = { workspace = true, features = ["ffi"] }
 async-trait = { workspace = true }
+bytes = { workspace = true }
 cudarc = { workspace = true, features = ["f16"] }
 fastlanes = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
 paste = { workspace = true }
+tokio = { workspace = true, features = ["fs"] }
 tracing = { workspace = true }
 vortex-alp = { workspace = true }
 vortex-array = { workspace = true }
diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs
index dcd76d9ea11..000c388f639 100644
--- a/vortex-cuda/benches/for_cuda.rs
+++ b/vortex-cuda/benches/for_cuda.rs
@@ -27,7 +27,9 @@ use vortex_cuda::CudaSession;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 use vortex_dtype::NativePType;
+use vortex_dtype::PType;
 use vortex_error::VortexExpect;
+use vortex_fastlanes::BitPackedArray;
 use vortex_fastlanes::FoRArray;
 use vortex_scalar::Scalar;
 use vortex_session::VortexSession;
@@ -36,20 +38,28 @@ const BENCH_ARGS: &[(usize, &str)] = &[(10_000_000, "10M")];
 const REFERENCE_VALUE: u8 = 10;
 
 /// Creates a FoR array with the specified type and length.
-fn make_for_array_typed<T>(len: usize) -> FoRArray
+fn make_for_array_typed<T>(len: usize, bp: bool) -> FoRArray
 where
     T: NativePType + From<u8> + Add<Output = T>,
     Scalar: From<T>,
 {
     let reference = <T as From<u8>>::from(REFERENCE_VALUE);
     let data: Vec<T> = (0..len)
-        .map(|i| <T as From<u8>>::from((i % 256) as u8) + reference)
+        .map(|i| <T as From<u8>>::from((i % 256) as u8))
         .collect();
     let primitive_array =
         PrimitiveArray::new(Buffer::from(data), Validity::NonNullable).into_array();
 
-    FoRArray::try_new(primitive_array, reference.into()).vortex_expect("failed to create FoR array")
+    if bp && T::PTYPE != PType::U8 {
+        let child =
+            BitPackedArray::encode(primitive_array.as_ref(), 8).vortex_expect("failed to bitpack");
+        FoRArray::try_new(child.into_array(), reference.into())
+            .vortex_expect("failed to create FoR array")
+    } else {
+        FoRArray::try_new(primitive_array, reference.into())
+            .vortex_expect("failed to create FoR array")
+    }
 }
 
 /// Launches FoR decompression kernel and returns elapsed GPU time.
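The `bp == true` path above is what the new `ffor_cuda` benchmarks exercise: an 8-bit bit-packed child holding the residuals, wrapped in a FoR array that carries the reference. A minimal standalone sketch of that construction for `u32`, using the same constructors as `make_for_array_typed` (crate paths are assumed and may differ from the bench's actual imports):

```rust
use vortex_array::IntoArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_error::VortexResult;
use vortex_fastlanes::{BitPackedArray, FoRArray};
use vortex_scalar::Scalar;

/// Builds logical values `10 + (i % 256)` as a FoR array over an 8-bit bit-packed child,
/// mirroring the `bp == true` branch of `make_for_array_typed`.
fn make_ffor_u32(len: usize) -> VortexResult<FoRArray> {
    let reference = 10u32;
    // The child stores only the residuals (0..256), which fit in 8 bits.
    let residuals: Vec<u32> = (0..len).map(|i| (i % 256) as u32).collect();
    let primitive =
        PrimitiveArray::new(Buffer::from(residuals), Validity::NonNullable).into_array();

    let packed = BitPackedArray::encode(primitive.as_ref(), 8)?;
    // The fused CUDA kernels add this reference while unpacking, rather than in a second pass.
    FoRArray::try_new(packed.into_array(), Scalar::from(reference))
}
```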
@@ -95,10 +105,49 @@ where
     let mut group = c.benchmark_group("for_cuda");
     group.sample_size(10);
 
-    for (len, len_str) in BENCH_ARGS {
+    for &(len, len_str) in BENCH_ARGS {
+        group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
+
+        let for_array = make_for_array_typed::<T>(len, false);
+
+        group.bench_with_input(
+            BenchmarkId::new("for", format!("{len_str}_{type_name}")),
+            &for_array,
+            |b, for_array| {
+                b.iter_custom(|iters| {
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                        .vortex_expect("failed to create execution context");
+
+                    let mut total_time = Duration::ZERO;
+
+                    for _ in 0..iters {
+                        let kernel_time =
+                            launch_for_kernel_timed_typed::<T>(for_array, &mut cuda_ctx)
+                                .vortex_expect("kernel launch failed");
+                        total_time += kernel_time;
+                    }
+
+                    total_time
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn benchmark_ffor_typed<T>(c: &mut Criterion, type_name: &str)
+where
+    T: NativePType + DeviceRepr + From<u8> + Add<Output = T>,
+    Scalar: From<T>,
+{
+    let mut group = c.benchmark_group("ffor_cuda");
+    group.sample_size(10);
+
+    for &(len, len_str) in BENCH_ARGS {
         group.throughput(Throughput::Bytes((len * size_of::<T>()) as u64));
 
-        let for_array = make_for_array_typed::<T>(*len);
+        let for_array = make_for_array_typed::<T>(len, true);
 
         group.bench_with_input(
             BenchmarkId::new("for", format!("{len_str}_{type_name}")),
@@ -134,7 +183,15 @@ fn benchmark_for(c: &mut Criterion) {
     benchmark_for_typed::<u64>(c, "u64");
 }
 
-criterion::criterion_group!(benches, benchmark_for);
+/// Benchmark FOR+BP decompression for all types.
+fn benchmark_ffor(c: &mut Criterion) {
+    benchmark_ffor_typed::<u8>(c, "u8");
+    benchmark_ffor_typed::<u16>(c, "u16");
+    benchmark_ffor_typed::<u32>(c, "u32");
+    benchmark_ffor_typed::<u64>(c, "u64");
+}
+
+criterion::criterion_group!(benches, benchmark_for, benchmark_ffor);
 
 #[cuda_available]
 criterion::criterion_main!(benches);
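The kernel generator changes below thread the FoR reference through every generated bit-unpacking kernel, so each store becomes `out[i] = unpacked + reference` (or just `reference` when the bit width is zero) instead of leaving the addition to a separate pass. Ignoring the FastLanes transposed 1024-value layout that `INDEX(row, lane)` encodes, the per-element arithmetic is roughly the following Rust sketch (the names and the flat layout are illustrative only, not the generated code):

```rust
/// Scalar model of the fused FoR + bit-unpack step: extract `bit_width` bits for
/// element `i` from the packed `u16` words, then add the FoR reference.
fn unpack_one_u16(packed: &[u16], bit_width: u32, reference: u16, i: usize) -> u16 {
    if bit_width == 0 {
        // A zero bit width means every code is zero, so the output is just the reference.
        return reference;
    }
    let mask = if bit_width == 16 { u16::MAX } else { (1u16 << bit_width) - 1 };
    let bit_pos = i as u32 * bit_width;
    let (word, offset) = ((bit_pos / 16) as usize, bit_pos % 16);

    let mut code = packed[word] >> offset;
    // When a value straddles a word boundary, pull its high bits from the next
    // packed word (the generated kernels do the same with their `src`/`tmp` shuffling).
    if offset + bit_width > 16 {
        code |= packed[word + 1] << (16 - offset);
    }
    (code & mask).wrapping_add(reference)
}
```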
diff --git a/vortex-cuda/cuda_kernel_generator/mod.rs b/vortex-cuda/cuda_kernel_generator/mod.rs
index 140eab2f2d8..5dc02f12795 100644
--- a/vortex-cuda/cuda_kernel_generator/mod.rs
+++ b/vortex-cuda/cuda_kernel_generator/mod.rs
@@ -20,23 +20,22 @@ fn generate_lane_decoder(
     writeln!(
         output,
-        "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
+        "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, const uint{bits}_t reference, unsigned int lane) {{"
     )?;
 
     output.indent(|output| {
         writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
 
         if bit_width == 0 {
-            writeln!(output, "uint{bits}_t zero = 0ULL;")?;
             writeln!(output)?;
 
             for row in 0..bits {
-                writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
+                writeln!(output, "out[INDEX({row}, lane)] = reference;")?;
             }
         } else if bit_width == bits {
             writeln!(output)?;
 
             for row in 0..bits {
                 writeln!(
                     output,
-                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane] + reference;",
                 )?;
             }
         } else {
@@ -72,7 +71,7 @@ fn generate_lane_decoder(
                 )?;
             }
 
-            writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
+            writeln!(output, "out[INDEX({row}, lane)] = tmp + reference;")?;
         }
     }
     Ok(())
@@ -93,14 +92,14 @@ fn generate_device_kernel_for_width(
     let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
 
     let local_func_params = format!(
-        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
+        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx)"
     );
 
     writeln!(output,
"__device__ void _{func_name}{local_func_params} {{")?; output.indent(|output| { for thread_lane in 0..per_thread_loop_count { - writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?; + writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?; } Ok(()) })?; @@ -116,8 +115,9 @@ fn generate_global_kernel_for_width( let bits = ::T; let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t"); - let func_params = - format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)"); + let func_params = format!( + "(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference)" + ); writeln!( output, @@ -132,7 +132,7 @@ fn generate_global_kernel_for_width( )?; writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?; - writeln!(output, "_{func_name}(in, out, thread_idx);") + writeln!(output, "_{func_name}(in, out, reference, thread_idx);") })?; writeln!(output, "}}") diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu index f86b685d77b..09dba79f62e 100644 --- a/vortex-cuda/kernels/src/bit_unpack_16.cu +++ b/vortex-cuda/kernels/src/bit_unpack_16.cu @@ -4,1060 +4,1059 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_16_0bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_0bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; - uint16_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; } -__device__ void _bit_unpack_16_0bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_0bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_0bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_0bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_0bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_0bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto 
in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_0bw_32t(in, out, thread_idx); + _bit_unpack_16_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_1bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_1bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 1); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_1bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_1bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_1bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_1bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_1bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_1bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_1bw_32t(in, out, thread_idx); + _bit_unpack_16_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_2bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_2bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int 
lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_2bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_2bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_2bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_2bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_2bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_2bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_2bw_32t(in, out, thread_idx); + _bit_unpack_16_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_3bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_3bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + 
reference; tmp = (src >> 9) & MASK(uint16_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 1)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 3); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_3bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_3bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_3bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_3bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_3bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_3bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_3bw_32t(in, out, thread_idx); + _bit_unpack_16_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_4bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_4bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & 
MASK(uint16_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_4bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_4bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_4bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_4bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_4bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_4bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_4bw_32t(in, out, thread_idx); + _bit_unpack_16_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_5bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_5bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 4)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = 
tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 2)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_5bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_5bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_5bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_5bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_5bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_5bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_5bw_32t(in, out, thread_idx); + _bit_unpack_16_5bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_6bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_6bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = 
tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_6bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_6bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_6bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_6bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_6bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_6bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_6bw_32t(in, out, thread_idx); + _bit_unpack_16_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_7bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_7bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 5)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 1)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = 
tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 2)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_7bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_7bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_7bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_7bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_7bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_7bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_7bw_32t(in, out, thread_idx); + _bit_unpack_16_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_8bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_8bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(8, 
lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 0)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_8bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_8bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_8bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_8bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_8bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_8bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_8bw_32t(in, out, thread_idx); + _bit_unpack_16_8bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_9bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_9bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 2)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 6)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 8)) << 1; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & 
MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 1)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 9); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 3)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 5)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_9bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_9bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_9bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_9bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_9bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_9bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_9bw_32t(in, out, thread_idx); + _bit_unpack_16_9bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_10bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_10bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 4)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 8)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 2)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 
6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 0)) << 10; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 4)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 6)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_10bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_10bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_10bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_10bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_10bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_10bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_10bw_32t(in, out, thread_idx); + _bit_unpack_16_10bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_11bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_11bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 6)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 1)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 7)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 
2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 11); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 8)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 3)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 9)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 11); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 10)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 5)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_11bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_11bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_11bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_11bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_11bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_11bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_11bw_32t(in, out, thread_idx); + _bit_unpack_16_11bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_12bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_12bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 0)) << 12; - 
out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 0)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 0)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 8)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 4)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_12bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_12bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_12bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_12bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_12bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_12bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_12bw_32t(in, out, thread_idx); + _bit_unpack_16_12bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_13bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_13bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 10)) << 3; - out[INDEX(1, 
lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 7)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 4)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 11)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 8)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 5)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 12)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 9)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 6)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_13bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_13bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_13bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_13bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_13bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_13bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_13bw_32t(in, out, thread_idx); + _bit_unpack_16_13bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_14bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void 
_bit_unpack_16_14bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 12)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 10)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 8)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 4)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 2)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 0)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint16_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 10)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 8)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 6)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 4)) << 10; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint16_t, 2)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_14bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_14bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_14bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_14bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_14bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_14bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ 
void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_14bw_32t(in, out, thread_idx); + _bit_unpack_16_14bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_15bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_15bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; uint16_t src; uint16_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint16_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint16_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint16_t, 14)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint16_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint16_t, 13)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint16_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint16_t, 12)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint16_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint16_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint16_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint16_t, 10)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint16_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint16_t, 9)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint16_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint16_t, 8)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint16_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint16_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint16_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint16_t, 6)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint16_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint16_t, 5)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint16_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint16_t, 4)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint16_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint16_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint16_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint16_t, 2)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint16_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint16_t, 1)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint16_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; } -__device__ void _bit_unpack_16_15bw_32t(const uint16_t *__restrict in, 
uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_15bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_15bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_15bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_15bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_15bw_lane(in, out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_15bw_32t(in, out, thread_idx); + _bit_unpack_16_15bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_16_16bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_16_16bw_lane(const uint16_t *__restrict in, uint16_t *__restrict out, const uint16_t reference, unsigned int lane) { unsigned int LANE_COUNT = 64; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; } -__device__ void _bit_unpack_16_16bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, int thread_idx) { - _bit_unpack_16_16bw_lane(in, out, thread_idx * 2 + 0); - _bit_unpack_16_16bw_lane(in, out, thread_idx * 2 + 1); +__device__ void _bit_unpack_16_16bw_32t(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx) { + _bit_unpack_16_16bw_lane(in, out, reference, thread_idx * 2 + 0); + _bit_unpack_16_16bw_lane(in, 
out, reference, thread_idx * 2 + 1); } -extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_16_16bw_32t(in, out, thread_idx); + _bit_unpack_16_16bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_32.cu b/vortex-cuda/kernels/src/bit_unpack_32.cu index 07aa4ae239e..64f285f0ae9 100644 --- a/vortex-cuda/kernels/src/bit_unpack_32.cu +++ b/vortex-cuda/kernels/src/bit_unpack_32.cu @@ -4,3603 +4,3602 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_32_0bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_0bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; - uint32_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; - out[INDEX(16, lane)] = zero; - out[INDEX(17, lane)] = zero; - out[INDEX(18, lane)] = zero; - out[INDEX(19, lane)] = zero; - out[INDEX(20, lane)] = zero; - out[INDEX(21, lane)] = zero; - out[INDEX(22, lane)] = zero; - out[INDEX(23, lane)] = zero; - out[INDEX(24, lane)] = zero; - out[INDEX(25, lane)] = zero; - out[INDEX(26, lane)] = zero; - out[INDEX(27, lane)] = zero; - out[INDEX(28, lane)] = zero; - out[INDEX(29, lane)] = zero; - out[INDEX(30, lane)] = zero; - out[INDEX(31, lane)] = zero; -} - -__device__ void _bit_unpack_32_0bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_0bw_lane(in, out, thread_idx * 1 + 0); -} - -extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; + out[INDEX(16, lane)] = reference; + out[INDEX(17, lane)] = reference; + out[INDEX(18, lane)] = reference; + out[INDEX(19, lane)] = reference; + out[INDEX(20, lane)] = reference; + out[INDEX(21, lane)] = reference; + out[INDEX(22, lane)] = reference; + out[INDEX(23, lane)] = reference; + out[INDEX(24, lane)] = reference; + out[INDEX(25, lane)] = reference; + out[INDEX(26, lane)] = reference; + out[INDEX(27, lane)] = reference; + out[INDEX(28, lane)] = reference; + out[INDEX(29, lane)] = reference; + out[INDEX(30, lane)] = reference; + 
out[INDEX(31, lane)] = reference; +} + +__device__ void _bit_unpack_32_0bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_0bw_lane(in, out, reference, thread_idx * 1 + 0); +} + +extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_0bw_32t(in, out, thread_idx); + _bit_unpack_32_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_1bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_1bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 1); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 1); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 1); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 1); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 1); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 1); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 1); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 1); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 1); - out[INDEX(23, lane)] = 
tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 1); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 1); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 1); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 1); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 1); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 1); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 1); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_1bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_1bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_1bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_1bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_1bw_32t(in, out, thread_idx); + _bit_unpack_32_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_2bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_2bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 2); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = 
tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 2; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 2); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 2); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 2); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 2); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 2); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 2); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 2); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 2); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 2); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 2); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 2); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 2); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 2); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 2); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 2); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_2bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_2bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_2bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_2bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_2bw_32t(in, out, thread_idx); + _bit_unpack_32_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_3bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_3bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 
3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 3); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 1)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 3); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 3); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 3); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 3); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 3); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 3); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 1; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 3); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 3); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 3); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 3); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 3); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 3); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 3); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 3); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 3); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void 
_bit_unpack_32_3bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_3bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_3bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_3bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_3bw_32t(in, out, thread_idx); + _bit_unpack_32_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_4bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_4bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & 
MASK(uint32_t, 4); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 4; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 4); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 4); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 4); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 4); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 4); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 4); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 4); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_4bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_4bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_4bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_4bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_4bw_32t(in, out, thread_idx); + _bit_unpack_32_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_5bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_5bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 5); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 
8) & MASK(uint32_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 5); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 5); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 5); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 5); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 1; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 5); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 5); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 5); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 5); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 5); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 3; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 5); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 5); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 5); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 5); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 5); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_5bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_5bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_5bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_5bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * 
(128 * 5 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_5bw_32t(in, out, thread_idx); + _bit_unpack_32_5bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_6bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_6bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 6); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 6); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 6; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 6); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 6); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 6); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 6); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 6); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 2; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 6); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 6); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 6); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; 
tmp = (src >> 22) & MASK(uint32_t, 6); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 6); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 6); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 6); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 6); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_6bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_6bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_6bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_6bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_6bw_32t(in, out, thread_idx); + _bit_unpack_32_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_7bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_7bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 7); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 7); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 7); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 
7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 2)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 7); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 7); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 5)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 7); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 7); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 7); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 1)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 7); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 7); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 7); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 7); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 3; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 7); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 7); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 7); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_7bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_7bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_7bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_7bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_7bw_32t(in, out, thread_idx); + _bit_unpack_32_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_8bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_8bw_lane(const uint32_t *__restrict in, uint32_t 
*__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 24) & 
MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 8; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 8); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 8); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 8); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_8bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_8bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_8bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_8bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_8bw_32t(in, out, thread_idx); + _bit_unpack_32_8bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_9bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_9bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 9); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 9); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 1; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 9); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 9); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 3)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 9); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp 
+ reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 9); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 7; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 9); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 9); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 9); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 3; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 9); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 9); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 9); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 9); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 9); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 5)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 9); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 9); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_9bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_9bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_9bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_9bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_9bw_32t(in, out, thread_idx); + _bit_unpack_32_9bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_10bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_10bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t 
src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 10); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 8)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 10); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 10); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 10); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 10); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 10; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 10); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 10); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 10); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 2; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 10); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 10); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 6)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 10); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 10); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 6; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 10); - out[INDEX(26, lane)] = 
tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 10); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 2)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 10); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 10); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_10bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_10bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_10bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_10bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_10bw_32t(in, out, thread_idx); + _bit_unpack_32_10bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_11bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_11bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 11); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 1)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 11); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 11); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 11); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 3)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 11); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 11); - 
out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 11); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 5)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 11); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 5; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 11); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 11); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 7)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 11); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 11); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 3; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 11); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 11); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 9)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 11); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 11); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 1; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 11); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_11bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_11bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_11bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_11bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_11bw_32t(in, out, thread_idx); + _bit_unpack_32_11bw_32t(in, out, reference, thread_idx); } -__device__ void 
_bit_unpack_32_12bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_12bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 12; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 12; 
- out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 12); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 12); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 12); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 12); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 4; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 12); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_12bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_12bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_12bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_12bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_12bw_32t(in, out, thread_idx); + _bit_unpack_32_12bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_13bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_13bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 13); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 7)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 13); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 13); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 8)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 13); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 
21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 13); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 9)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 13); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 13); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 3; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 13); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 9; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 13); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 13); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 11)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 13); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 5)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 13); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 13); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 1; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 13); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 7; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 13); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_13bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_13bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_13bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_13bw_lane(in, out, 
reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_13bw_32t(in, out, thread_idx); + _bit_unpack_32_13bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_14bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_14bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 14); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 10)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 14); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 14); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 2)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 14); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 14); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 14); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 10; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 14); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 14; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 14); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 14); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 10)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + 
reference; tmp = (src >> 10) & MASK(uint32_t, 14); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 6)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 14); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 2)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 14); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 14); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 2; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 14); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 8)) << 6; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 14); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 10; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 14); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_14bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_14bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_14bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_14bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_14bw_32t(in, out, thread_idx); + _bit_unpack_32_14bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_15bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_15bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 15); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 13)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 15); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp 
+ reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 15); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 9)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 15); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 15); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 5)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 15); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 15); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 15); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 1; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 15); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 3; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 15); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 5; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 15); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 7; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 15); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 9; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 15); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 11; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp 
= (src >> 4) & MASK(uint32_t, 15); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 13; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 15); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_15bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_15bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_15bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_15bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_15bw_32t(in, out, thread_idx); + _bit_unpack_32_15bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_16bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_16bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - 
out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 16; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 16); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_16bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_16bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_16bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_16bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x 
* (128 * 16 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_16bw_32t(in, out, thread_idx); + _bit_unpack_32_16bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_17bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_17bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 17); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 2)) << 15; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 17); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 4)) << 13; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 17); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 6)) << 11; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 17); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 8)) << 9; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 17); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 10)) << 7; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 17); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 12)) << 5; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 17); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 14)) << 3; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 17); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 16)) << 1; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 1)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 17); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 3)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 17); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 11]; tmp |= (src & 
MASK(uint32_t, 5)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 17); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 7)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 17); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 9)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 17); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 11)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 17); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 13)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 17); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 15)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_17bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_17bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_17bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_17bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_17bw_32t(in, out, thread_idx); + _bit_unpack_32_17bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_18bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_18bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 18); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 4)) << 14; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 18); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 10; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 18); - 
out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 12)) << 6; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 18); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 16)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 18); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 18); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 18); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 18; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 18); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 14; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 18); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 10; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 18); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 12)) << 6; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 18); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 2; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 18); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 6)) << 12; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 18); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, 
lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 10)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 18); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 14)) << 4; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_18bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_18bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_18bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_18bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_18bw_32t(in, out, thread_idx); + _bit_unpack_32_18bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_19bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_19bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 19); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 6)) << 13; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 19); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 12)) << 7; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 19); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 18)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 5)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 19); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 11)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 19); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 17)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + 
LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 4)) << 15; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 19); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 10)) << 9; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 19); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 16)) << 3; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 3)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 19); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 9)) << 10; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 19); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 15)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 2)) << 17; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 19); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 8)) << 11; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 19); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 14)) << 5; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 1)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 19); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 7)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 19); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 13)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_19bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_19bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_19bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_19bw_lane(in, out, reference, thread_idx * 1 + 0); } 
-extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_19bw_32t(in, out, thread_idx); + _bit_unpack_32_19bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_20bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_20bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 
8)) << 12; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 20; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 20); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 8)) << 12; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 20); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 16)) << 4; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 4)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 20); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 12)) << 8; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_20bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_20bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_20bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_20bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_20bw_32t(in, out, thread_idx); + _bit_unpack_32_20bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_21bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_21bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 21); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = 
tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 10)) << 11; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 21); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 20)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 9)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 21); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 19)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 13; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 21); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 18)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 7)) << 14; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 21); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 17)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 6)) << 15; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 21); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 5; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 5)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 21); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 15)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 17; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 21); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 14)) << 7; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 3)) << 18; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; 
tmp = (src >> 3) & MASK(uint32_t, 21); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 13)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 2)) << 19; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 21); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 12)) << 9; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 1)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 21); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 11)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_21bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_21bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_21bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_21bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_21bw_32t(in, out, thread_idx); + _bit_unpack_32_21bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_22bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_22bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 22); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 12)) << 10; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 2)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 22); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 14)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 4)) << 18; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; 
tmp = (src >> 4) & MASK(uint32_t, 22); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 16)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 6)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 22); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 18)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 14; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 22); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 20)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 0)) << 22; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 22); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 12)) << 10; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 2)) << 20; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 22); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 14)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 4)) << 18; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 22); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 16)) << 6; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 6)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 22); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 18)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 8)) << 14; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp 
= (src >> 8) & MASK(uint32_t, 22); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 20)) << 2; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 10)) << 12; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_22bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_22bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_22bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_22bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_22bw_32t(in, out, thread_idx); + _bit_unpack_32_22bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_23bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_23bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 23); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 14)) << 9; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 5)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 23); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 19)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 10)) << 13; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 1)) << 22; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 23); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 15)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 6)) << 17; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 23); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 29) 
& MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 20)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 11)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 2)) << 21; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 23); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 16)) << 7; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 7)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 23); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 21)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 12)) << 11; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 3)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 23); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 17)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 8)) << 15; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 23); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 22)) << 1; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 13)) << 10; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 4)) << 19; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 23); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 18)) << 5; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 9)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_23bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - 
_bit_unpack_32_23bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_23bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_23bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_23bw_32t(in, out, thread_idx); + _bit_unpack_32_23bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_24bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_24bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + 
reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 0)) << 24; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 24); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 16)) << 8; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 8)) << 16; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_24bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_24bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_24bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_24bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint32_t))); auto out = full_out + 
(blockIdx.x * 1024); - _bit_unpack_32_24bw_32t(in, out, thread_idx); + _bit_unpack_32_24bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_25bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_25bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 25); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 18)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 11)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 4)) << 21; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 25); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 22)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 15)) << 10; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 8)) << 17; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 1)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 25); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 19)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 13; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 5)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 25); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 23)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 16)) << 9; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 9)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 23; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 25); - out[INDEX(18, lane)] = tmp; 
+ out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 20)) << 5; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 13)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 6)) << 19; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 25); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 24)) << 1; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 17)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 10)) << 15; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 3)) << 22; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 25); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 21)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 14)) << 11; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 7)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_25bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_25bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_25bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_25bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_25bw_32t(in, out, thread_idx); + _bit_unpack_32_25bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_26bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_26bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 26); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = 
tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 20)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 14)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 8)) << 18; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 2)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 26); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 22)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 16)) << 10; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 10)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 4)) << 22; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 26); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 24)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 18)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 14; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 0)) << 26; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 26); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 20)) << 6; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 14)) << 12; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 8)) << 18; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 2)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 26); - out[INDEX(21, lane)] = tmp; + 
out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 22)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 16)) << 10; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 10)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 4)) << 22; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 26); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 24)) << 2; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 18)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 12)) << 14; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 6)) << 20; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_26bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_26bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_26bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_26bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_26bw_32t(in, out, thread_idx); + _bit_unpack_32_26bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_27bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_27bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 27); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 22)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 17)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 3]; tmp |= (src & 
MASK(uint32_t, 12)) << 15; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 7)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 2)) << 25; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 27); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 24)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 19)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 13; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 9)) << 18; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 4)) << 23; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 27); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 26)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 21)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 16)) << 11; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 11)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 6)) << 21; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 1)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 27); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 23)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 18)) << 9; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 13)) << 14; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 8)) << 19; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = 
tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 3)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 27); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 25)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 20)) << 7; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 15)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 10)) << 17; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 5)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_27bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_27bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_27bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_27bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_27bw_32t(in, out, thread_idx); + _bit_unpack_32_27bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_28bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_28bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(5, 
lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 0)) << 28; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 28); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 24)) << 4; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & 
MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 20)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 16)) << 12; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 12)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 8)) << 20; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 4)) << 24; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_28bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_28bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_28bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_28bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_28bw_32t(in, out, thread_idx); + _bit_unpack_32_28bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_29bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_29bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 29); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 26)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 23)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 20)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 17)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 14)) << 15; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 11)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 7]; tmp |= (src & 
MASK(uint32_t, 8)) << 21; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 5)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 2)) << 27; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 29); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 28)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 25)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 22)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 19)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 16)) << 13; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 13)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 10)) << 19; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 7)) << 22; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 4)) << 25; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 1)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 29); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 27)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 24)) << 5; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 21)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 18)) << 11; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 15)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 25]; tmp |= 
(src & MASK(uint32_t, 12)) << 17; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 9)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 6)) << 23; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 3)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint32_t, 29); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_29bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_29bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_29bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_29bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_29bw_32t(in, out, thread_idx); + _bit_unpack_32_29bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_30bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_30bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 30); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 28)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 26)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 24)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 22)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 20)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 18)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 16)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 14)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, 
lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 12)) << 18; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 10)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 8)) << 22; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 6)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 4)) << 26; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 2)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 0)) << 30; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint32_t, 30); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 28)) << 2; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 26)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 24)) << 6; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 22)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 20)) << 10; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 18)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 16)) << 14; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 14)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 12)) << 18; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 10)) << 20; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 8)) << 22; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 27]; 
tmp |= (src & MASK(uint32_t, 6)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 4)) << 26; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint32_t, 2)) << 28; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 30); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_30bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_30bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_30bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_30bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_30bw_32t(in, out, thread_idx); + _bit_unpack_32_30bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_31bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_31bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; uint32_t src; uint32_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint32_t, 31); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint32_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint32_t, 30)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint32_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint32_t, 29)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint32_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint32_t, 28)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint32_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint32_t, 27)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint32_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint32_t, 26)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint32_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint32_t, 25)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint32_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint32_t, 24)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint32_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint32_t, 23)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint32_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint32_t, 22)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = 
tmp + reference; tmp = (src >> 22) & MASK(uint32_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint32_t, 21)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint32_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint32_t, 20)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint32_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint32_t, 19)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint32_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint32_t, 18)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint32_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint32_t, 17)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint32_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint32_t, 16)) << 15; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint32_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint32_t, 15)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint32_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint32_t, 14)) << 17; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint32_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint32_t, 13)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint32_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint32_t, 12)) << 19; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint32_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint32_t, 11)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint32_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint32_t, 10)) << 21; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint32_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint32_t, 9)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint32_t, 23); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint32_t, 8)) << 23; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint32_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint32_t, 7)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint32_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint32_t, 6)) << 25; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint32_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint32_t, 5)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint32_t, 27); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint32_t, 4)) << 27; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint32_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint32_t, 3)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + 
reference; tmp = (src >> 3) & MASK(uint32_t, 29); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint32_t, 2)) << 29; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint32_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint32_t, 1)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint32_t, 31); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; } -__device__ void _bit_unpack_32_31bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_31bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_32_31bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_31bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_31bw_32t(in, out, thread_idx); + _bit_unpack_32_31bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_32_32bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_32_32bw_lane(const uint32_t *__restrict in, uint32_t *__restrict out, const uint32_t reference, unsigned int lane) { unsigned int LANE_COUNT = 32; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; - out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane]; - out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane]; - out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane]; - out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane]; - out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane]; - out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane]; - out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane]; - out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane]; - out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane]; - out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane]; - out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane]; - out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane]; - out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane]; - out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane]; - out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane]; - out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane]; -} - -__device__ void _bit_unpack_32_32bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) { - _bit_unpack_32_32bw_lane(in, out, thread_idx * 1 + 0); -} - -extern "C" 
__global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out) { + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; + out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane] + reference; + out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane] + reference; + out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane] + reference; + out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane] + reference; + out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane] + reference; + out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane] + reference; + out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane] + reference; + out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane] + reference; + out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane] + reference; + out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane] + reference; + out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane] + reference; + out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane] + reference; + out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane] + reference; + out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane] + reference; + out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane] + reference; + out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane] + reference; +} + +__device__ void _bit_unpack_32_32bw_32t(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx) { + _bit_unpack_32_32bw_lane(in, out, reference, thread_idx * 1 + 0); +} + +extern "C" __global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint32_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_32_32bw_32t(in, out, thread_idx); + _bit_unpack_32_32bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_64.cu b/vortex-cuda/kernels/src/bit_unpack_64.cu index ea221192059..236e46b61b8 100644 --- a/vortex-cuda/kernels/src/bit_unpack_64.cu +++ b/vortex-cuda/kernels/src/bit_unpack_64.cu @@ -4,13331 +4,13330 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_64_0bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_0bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; - uint64_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] 
= zero; - out[INDEX(6, lane)] = zero; - out[INDEX(7, lane)] = zero; - out[INDEX(8, lane)] = zero; - out[INDEX(9, lane)] = zero; - out[INDEX(10, lane)] = zero; - out[INDEX(11, lane)] = zero; - out[INDEX(12, lane)] = zero; - out[INDEX(13, lane)] = zero; - out[INDEX(14, lane)] = zero; - out[INDEX(15, lane)] = zero; - out[INDEX(16, lane)] = zero; - out[INDEX(17, lane)] = zero; - out[INDEX(18, lane)] = zero; - out[INDEX(19, lane)] = zero; - out[INDEX(20, lane)] = zero; - out[INDEX(21, lane)] = zero; - out[INDEX(22, lane)] = zero; - out[INDEX(23, lane)] = zero; - out[INDEX(24, lane)] = zero; - out[INDEX(25, lane)] = zero; - out[INDEX(26, lane)] = zero; - out[INDEX(27, lane)] = zero; - out[INDEX(28, lane)] = zero; - out[INDEX(29, lane)] = zero; - out[INDEX(30, lane)] = zero; - out[INDEX(31, lane)] = zero; - out[INDEX(32, lane)] = zero; - out[INDEX(33, lane)] = zero; - out[INDEX(34, lane)] = zero; - out[INDEX(35, lane)] = zero; - out[INDEX(36, lane)] = zero; - out[INDEX(37, lane)] = zero; - out[INDEX(38, lane)] = zero; - out[INDEX(39, lane)] = zero; - out[INDEX(40, lane)] = zero; - out[INDEX(41, lane)] = zero; - out[INDEX(42, lane)] = zero; - out[INDEX(43, lane)] = zero; - out[INDEX(44, lane)] = zero; - out[INDEX(45, lane)] = zero; - out[INDEX(46, lane)] = zero; - out[INDEX(47, lane)] = zero; - out[INDEX(48, lane)] = zero; - out[INDEX(49, lane)] = zero; - out[INDEX(50, lane)] = zero; - out[INDEX(51, lane)] = zero; - out[INDEX(52, lane)] = zero; - out[INDEX(53, lane)] = zero; - out[INDEX(54, lane)] = zero; - out[INDEX(55, lane)] = zero; - out[INDEX(56, lane)] = zero; - out[INDEX(57, lane)] = zero; - out[INDEX(58, lane)] = zero; - out[INDEX(59, lane)] = zero; - out[INDEX(60, lane)] = zero; - out[INDEX(61, lane)] = zero; - out[INDEX(62, lane)] = zero; - out[INDEX(63, lane)] = zero; -} - -__device__ void _bit_unpack_64_0bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_0bw_lane(in, out, thread_idx * 1 + 0); -} - -extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; + out[INDEX(8, lane)] = reference; + out[INDEX(9, lane)] = reference; + out[INDEX(10, lane)] = reference; + out[INDEX(11, lane)] = reference; + out[INDEX(12, lane)] = reference; + out[INDEX(13, lane)] = reference; + out[INDEX(14, lane)] = reference; + out[INDEX(15, lane)] = reference; + out[INDEX(16, lane)] = reference; + out[INDEX(17, lane)] = reference; + out[INDEX(18, lane)] = reference; + out[INDEX(19, lane)] = reference; + out[INDEX(20, lane)] = reference; + out[INDEX(21, lane)] = reference; + out[INDEX(22, lane)] = reference; + out[INDEX(23, lane)] = reference; + out[INDEX(24, lane)] = reference; + out[INDEX(25, lane)] = reference; + out[INDEX(26, lane)] = reference; + out[INDEX(27, lane)] = reference; + out[INDEX(28, lane)] = reference; + out[INDEX(29, lane)] = reference; + out[INDEX(30, lane)] = reference; + out[INDEX(31, lane)] = reference; + out[INDEX(32, lane)] = reference; + out[INDEX(33, lane)] = reference; + out[INDEX(34, lane)] = reference; + out[INDEX(35, lane)] = reference; + out[INDEX(36, lane)] = reference; + out[INDEX(37, lane)] = reference; + out[INDEX(38, lane)] = reference; + out[INDEX(39, lane)] = reference; + out[INDEX(40, lane)] 
= reference; + out[INDEX(41, lane)] = reference; + out[INDEX(42, lane)] = reference; + out[INDEX(43, lane)] = reference; + out[INDEX(44, lane)] = reference; + out[INDEX(45, lane)] = reference; + out[INDEX(46, lane)] = reference; + out[INDEX(47, lane)] = reference; + out[INDEX(48, lane)] = reference; + out[INDEX(49, lane)] = reference; + out[INDEX(50, lane)] = reference; + out[INDEX(51, lane)] = reference; + out[INDEX(52, lane)] = reference; + out[INDEX(53, lane)] = reference; + out[INDEX(54, lane)] = reference; + out[INDEX(55, lane)] = reference; + out[INDEX(56, lane)] = reference; + out[INDEX(57, lane)] = reference; + out[INDEX(58, lane)] = reference; + out[INDEX(59, lane)] = reference; + out[INDEX(60, lane)] = reference; + out[INDEX(61, lane)] = reference; + out[INDEX(62, lane)] = reference; + out[INDEX(63, lane)] = reference; +} + +__device__ void _bit_unpack_64_0bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_0bw_lane(in, out, reference, thread_idx * 1 + 0); +} + +extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_0bw_16t(in, out, thread_idx); + _bit_unpack_64_0bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_1bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_1bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 1); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 1); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 1); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 1); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 1); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 1); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 1); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 1); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & 
MASK(uint64_t, 1); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 1); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 1); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 1); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 1); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 1); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 1); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 1); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 1); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 1); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 1); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 1); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 1); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 1); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 1); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 1); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 1); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 1); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 1); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 1); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 1); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 1); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 1); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 1); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 1); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 1); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 1); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 1); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 1); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 1); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 1); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 1); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & 
MASK(uint64_t, 1); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 1); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 1); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 1); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 1); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 1); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 1); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 1); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 1); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 1); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 1); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 1); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 1); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 1); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 1); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_1bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_1bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_1bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_1bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_1bw_16t(in, out, thread_idx); + _bit_unpack_64_1bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_2bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_2bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 2); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 2); - 
out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 2); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 2); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 2); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 2); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 2); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 2); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 2); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 2); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 2); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 2); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 2); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 2); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 2); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 2); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 2); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 2); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 2); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 2); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 2); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 2); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 2); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 2); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 2); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 2); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 2; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 2); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 2); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 2); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 2); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 2); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp 
+ reference; tmp = (src >> 10) & MASK(uint64_t, 2); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 2); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 2); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 2); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 2); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 2); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 2); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 2); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 2); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 2); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 2); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 2); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 2); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 2); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 2); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 2); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 2); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 2); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 2); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 2); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 2); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 2); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 2); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 2); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 2); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 2); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_2bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_2bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_2bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_2bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void 
bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_2bw_16t(in, out, thread_idx); + _bit_unpack_64_2bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_3bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_3bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 3); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 3); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 3); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 3); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 3); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 3); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 3); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 3); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 3); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 3); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 3); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 3); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 3); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 3); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 3); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 1; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 3); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 3); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 3); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 3); - 
out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 3); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 3); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 3); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 3); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 3); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 3); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 3); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 3); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 3); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 3); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 3); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 3); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 3); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 3); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 3); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 3); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 1)) << 2; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 3); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 3); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 3); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 3); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 3); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 3); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 3); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 3); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 3); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 3); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 3); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 3); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 3); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 3); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, 
lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 3); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 3); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 3); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 3); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 3); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 3); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_3bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_3bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_3bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_3bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_3bw_16t(in, out, thread_idx); + _bit_unpack_64_3bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_4bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_4bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; 
tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, 
lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 4; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 4); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 4); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 4); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 4); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 4); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 4); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 4); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 4); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 4); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 4); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 4); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 4); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 4); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 4); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 4); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_4bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_4bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_4bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_4bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_4bw_16t(in, out, thread_idx); + _bit_unpack_64_4bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_5bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_5bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & 
MASK(uint64_t, 5); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 5); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 5); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 5); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 5); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 5); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 5); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 5); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 1)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 5); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 5); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 5); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 5); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 5); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 5); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 5); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 5); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 5); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 5); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 5); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 5); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 3; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 5); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 5); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 5); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 5); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 5); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 5); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src 
>> 32) & MASK(uint64_t, 5); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 5); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 5); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 5); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 5); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 5); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 2; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 5); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 5); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 5); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 5); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 5); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 5); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 5); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 5); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 5); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 5); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 5); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 5); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 1; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 5); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 5); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 5); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 5); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 5); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 5); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 5); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 5); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 5); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 5); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 5); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = 
tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_5bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_5bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_5bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_5bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_5bw_16t(in, out, thread_idx); + _bit_unpack_64_5bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_6bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_6bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 6); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 6); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 6); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 6); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 6); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 6); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 6); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 6); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 6); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 6); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 6); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 6); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 6); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 6); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 6); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 6); - 
out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 6); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 2; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 6); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 6); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 6); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 6); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 6); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 6); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 6); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 6); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 6); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 6; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 6); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 6); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 6); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 6); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 6); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 6); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 6); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 6); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 6); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 6); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 2)) << 4; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 6); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 6); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 6); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 6); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 6); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 6); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 6); - out[INDEX(49, 
lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 6); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 6); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 6); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 2; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 6); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 6); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 6); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 6); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 6); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 6); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 6); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 6); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 6); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_6bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_6bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_6bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_6bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_6bw_16t(in, out, thread_idx); + _bit_unpack_64_6bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_7bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_7bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 7); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 7); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 7); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 7); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 7); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp 
= (src >> 42) & MASK(uint64_t, 7); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 7); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 1; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 7); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 7); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 7); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 7); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 7); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 7); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 7); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 7); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 5)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 7); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 7); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 7); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 7); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 7); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 7); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 7); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 7); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 3; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 7); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 7); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 7); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 7); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 7); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 7); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 7); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 7); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & 
MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 3)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 7); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 7); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 7); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 7); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 7); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 7); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 7); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 7); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 5; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 7); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 7); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 7); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 7); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 7); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 7); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 7); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 7); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 6; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 7); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 7); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 7); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 7); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 7); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 7); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 7); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 7); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_7bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_7bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_7bw_16t(const uint64_t *__restrict in, uint64_t 
*__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_7bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_7bw_16t(in, out, thread_idx); + _bit_unpack_64_7bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_8bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_8bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 48) & 
MASK(uint64_t, 8); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(51, lane)] = tmp; + 
out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 8; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 8); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 8); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 8); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 8); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 8); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 8); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 8); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_8bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_8bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_8bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_8bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_8bw_16t(in, out, thread_idx); + _bit_unpack_64_8bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_9bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_9bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 9); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 9); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 9); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 9); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 9); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 9); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 9); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 1; - out[INDEX(7, lane)] 
= tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 9); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 9); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 9); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 9); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 9); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 9); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 9); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 9); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 9); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 9); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 9); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 9); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 3; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 9); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 9); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 9); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 9); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 9); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 9); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 5)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 9); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 9); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 9); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 9); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 9); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 9); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 5; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 9); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = 
(src >> 13) & MASK(uint64_t, 9); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 9); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 9); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 9); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 9); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 3)) << 6; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 9); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 9); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 9); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 9); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 9); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 9); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 7; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 9); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 9); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 9); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 9); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 9); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 9); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 9); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 9); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 9); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 9); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 9); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 9); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_9bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_9bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_9bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_9bw_lane(in, out, 
reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_9bw_16t(in, out, thread_idx); + _bit_unpack_64_9bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_10bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_10bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 10); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 10); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 10); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 10); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 10); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 10); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 10); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 10); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 10); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 10); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 10); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 10); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 10); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 10); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 10); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 10); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 10); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 2; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 10); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 10); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & 
MASK(uint64_t, 10); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 10); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 10); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 6; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 10); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 10); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 10); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 10); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 10); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 10; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 10); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 10); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 10); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 10); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 10); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 10); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 4; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 10); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 10); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 10); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 10); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 10); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 10); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 10); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 10); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 10); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 10); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 10); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = 
in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 2; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 10); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 10); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 10); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 10); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 10); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 6; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 10); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 10); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 10); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 10); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 10); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_10bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_10bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_10bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_10bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_10bw_16t(in, out, thread_idx); + _bit_unpack_64_10bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_11bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_11bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 11); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 11); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 11); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 11); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 11); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 9; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 11); - 
out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 11); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 11); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 11); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 11); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 7; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 11); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 11); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 11); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 11); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 11); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 5; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 11); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 11); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 11); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 11); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 11); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 3; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 11); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 11); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 11); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 11); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 11); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 1; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 11); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 11); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 11); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 11); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 10; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; 
tmp = (src >> 1) & MASK(uint64_t, 11); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 11); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 11); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 11); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 11); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 3)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 11); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 11); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 11); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 11); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 11); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 5)) << 6; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 11); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 11); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 11); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 11); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 11); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 7)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 11); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 11); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 11); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 11); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 11); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 9)) << 2; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 11); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 11); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 11); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 11); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void 
_bit_unpack_64_11bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_11bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_11bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_11bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_11bw_16t(in, out, thread_idx); + _bit_unpack_64_11bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_12bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_12bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 12; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 36) & 
MASK(uint64_t, 12); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 12; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & 
MASK(uint64_t, 0)) << 12; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 12); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 12); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 12); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 12); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 12); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 4; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 12); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 12); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 12); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 12); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 4)) << 8; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 12); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 12); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 12); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 12); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_12bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_12bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_12bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_12bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_12bw_16t(in, out, thread_idx); + _bit_unpack_64_12bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_13bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_13bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 13); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 13); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 13); - out[INDEX(2, lane)] = tmp; + 
out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 13); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 1)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 13); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 13); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 13); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 13); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 11; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 13); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 13); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 13); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 13); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 13); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 13); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 13); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 13); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 9; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 13); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 13); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 13); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 13); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 5)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 13); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 13); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 13); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 13); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 7; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 13); - out[INDEX(30, lane)] = tmp; + 
out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 13); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 13); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 13); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 7)) << 6; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 13); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 13); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 13); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 13); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 5; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 13); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 13); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 13); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 13); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 13); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 13); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 13); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 13); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 3; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 13); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 13); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 13); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 13); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 11)) << 2; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 13); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 13); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 13); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 13); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 63) & 
MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 1; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 13); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 13); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 13); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_13bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_13bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_13bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_13bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_13bw_16t(in, out, thread_idx); + _bit_unpack_64_13bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_14bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_14bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 14); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 14); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 14); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 14); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 14); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 14); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 14); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 14); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 2; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 14); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 14); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 14); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 10; - out[INDEX(13, lane)] 
= tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 14); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 14); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 14); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 14); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 14); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 14); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 14); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 14); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 14); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 14); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 14); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 6; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 14); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 14); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 14); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 14; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 14); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 14); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 14); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 14); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 6)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 14); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 14); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 14); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 14); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 2; - 
out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 14); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 14); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 14); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 10; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 14); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 14); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 14); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 14); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 4; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 14); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 14); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 14); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 12; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 14); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 14); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 14); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 14); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 6; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 14); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 14); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 14); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_14bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_14bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_14bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_14bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 14 / 
sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_14bw_16t(in, out, thread_idx); + _bit_unpack_64_14bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_15bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_15bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 15); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 15); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 15); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 15); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 11)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 15); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 15); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 15); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 15); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 15); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 15); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 15); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 15); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 15); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 15); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 1; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 15); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 15); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 15); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 5; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 15); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 15); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) 
& MASK(uint64_t, 15); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 9; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 15); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 15); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 15); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 13; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 15); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 15); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 15); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 15); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 2; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 15); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 15); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 15); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 6; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 15); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 15); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 15); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 5)) << 10; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 15); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 15); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 15); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 1)) << 14; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 15); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 15); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 15); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 15); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 3; - 
out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 15); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 15); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 15); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 7; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 15); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 15); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 15); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 11; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 15); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 15); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 15); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_15bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_15bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_15bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_15bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_15bw_16t(in, out, thread_idx); + _bit_unpack_64_15bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_16bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_16bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 
16); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(33, lane)] = tmp; 
+ out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 16; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 16); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, 
lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 16); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 16); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_16bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_16bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_16bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_16bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_16bw_16t(in, out, thread_idx); + _bit_unpack_64_16bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_17bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_17bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 17); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 17); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 17); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 4)) << 13; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 17); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 17); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 17); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 8)) << 9; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 17); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 17); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 17); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 5; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 17); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 17); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 17); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & 
MASK(uint64_t, 16)) << 1; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 17); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 17); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 3)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 17); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 17); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 17); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 7)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 17); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 17); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 17); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 17); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 17); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 17); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 15)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 17); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 17); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 2)) << 15; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 17); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 17); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 17); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 11; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 17); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 17); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 17); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 7; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 
17); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 17); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 17); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 3; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 17); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 17); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 1)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 17); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 17); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 17); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 5)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 17); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 17); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 17); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 17); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 17); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 17); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 13)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 17); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 17); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_17bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_17bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_17bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_17bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint64_t))); auto out = full_out + 
(blockIdx.x * 1024); - _bit_unpack_64_17bw_16t(in, out, thread_idx); + _bit_unpack_64_17bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_18bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_18bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 18); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 18); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 18); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 10; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 18); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 18); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 18); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 2; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 18); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 18); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 18); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 18); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 18); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 18); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 18); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 14; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 18); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 18); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 18); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 6; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 18); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 18); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = 
tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 18); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 18); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 18); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 18); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 18); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 18; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 18); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 18); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 18); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 10; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 18); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 18); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 18); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 2; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 18); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 18); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 6)) << 12; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 18); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 18); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 18); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 14)) << 4; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 18); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 18); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 14; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 18); - 
out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 18); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 18); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 6; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 18); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 18); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 18); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 18); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 18); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 10)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 18); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 18); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_18bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_18bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_18bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_18bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_18bw_16t(in, out, thread_idx); + _bit_unpack_64_18bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_19bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_19bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 19); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 19); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 19); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 12)) << 7; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 19); - out[INDEX(4, lane)] = tmp; 
+ out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 19); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 5)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 19); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 19); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 19); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 17)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 19); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 19); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 9; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 19); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 19); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 3)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 19); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 19); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 19); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 15)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 19); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 19); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 11; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 19); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 19); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 19); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 19); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 19); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 13)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 
19); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 19); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 13; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 19); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 19); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 19); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 18)) << 1; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 19); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 19); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 11)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 19); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 19); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 15; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 19); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 19); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 19); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 3; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 19); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 19); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 10; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 19); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 19); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 17; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 19); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 19); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 19); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 5; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] 
= tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 19); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 19); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 7)) << 12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 19); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 19); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_19bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_19bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_19bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_19bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_19bw_16t(in, out, thread_idx); + _bit_unpack_64_19bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_20bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_20bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & 
MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src 
& MASK(uint64_t, 12)) << 8; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 20; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 20); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 20); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 20); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 4; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 20); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 20); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 8; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 20); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 20); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 12; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 20); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 20); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 4)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 20); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 20); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_20bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - 
_bit_unpack_64_20bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_20bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_20bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_20bw_16t(in, out, thread_idx); + _bit_unpack_64_20bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_21bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_21bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 21); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 21); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 21); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 21); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 21); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 19)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 21); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 21); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 3; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 21); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 21); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 17)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 21); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 21); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 5; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 21); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 21); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & 
MASK(uint64_t, 15)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 21); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 21); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 7; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 21); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 21); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 21); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 21); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 9; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 21); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 21); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 11)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 21); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 21); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 11; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 21); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 21); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 9)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 21); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 21); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 13; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 21); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 21); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 7)) << 14; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 21); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 21); - out[INDEX(44, lane)] = tmp; + 
out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 15; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 21); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 21); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 5)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 21); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 21); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 17; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 21); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 21); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 18; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 21); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 21); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 19; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 21); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 21); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 1)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 21); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 21); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_21bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_21bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_21bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_21bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_21bw_16t(in, out, thread_idx); + _bit_unpack_64_21bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_22bw_lane(const uint64_t 
*__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_22bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 22); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 22); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 22); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 22); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 18; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 22); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 22); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 22); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 22); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 14; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 22); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 22); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 22); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 22); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 10; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 22); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 22); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 22); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 22); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 6; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & 
MASK(uint64_t, 22); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 22); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 22); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 22); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 2; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 22); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 22; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 22); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 22); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 20; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 22); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 22); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 18; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 22); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 22); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 6)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 22); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 22); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 8)) << 14; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 22); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 22); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 10)) << 12; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 22); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 22); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 10; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; 
tmp = (src >> 12) & MASK(uint64_t, 22); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 22); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 14)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 22); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 22); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 6; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 22); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 22); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 18)) << 4; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 22); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 22); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 2; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 22); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_22bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_22bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_22bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_22bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_22bw_16t(in, out, thread_idx); + _bit_unpack_64_22bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_23bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_23bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 23); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 23); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 5)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 23); - out[INDEX(3, lane)] = 
tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 23); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 10)) << 13; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 23); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 23); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 15)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 23); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 23); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 20)) << 3; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 23); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 21; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 23); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 23); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 7)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 23); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 23); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 11; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 23); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 23); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 17)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 23); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 23); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 1; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 23); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 19; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 23); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 23); - out[INDEX(29, lane)] = 
tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 9)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 23); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 23); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 9; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 23); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 23); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 19)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 23); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 1)) << 22; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 23); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 23); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 17; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 23); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 23); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 11)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 23); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 23); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 7; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 23); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 23); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 21)) << 2; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 23); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 3)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 23); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 23); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = 
in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 15; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 23); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 23); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 13)) << 10; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 23); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 23); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 5; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 23); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_23bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_23bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_23bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_23bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_23bw_16t(in, out, thread_idx); + _bit_unpack_64_23bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_24bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_24bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - 
out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 
16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 24; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 24); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 24); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 8)) << 16; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 24); - out[INDEX(59, 
lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 24); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 8; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 24); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_24bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_24bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_24bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_24bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_24bw_16t(in, out, thread_idx); + _bit_unpack_64_24bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_25bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_25bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 25); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 25); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 11)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 25); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 25); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 3; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 25); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 17; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 25); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 25); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 19)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 25); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 5)) << 20; - out[INDEX(12, lane)] 
= tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 25); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 25); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 9; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 25); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 23; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 25); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 25); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 25); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 25); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 1; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 25); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 15; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 25); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 25); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 21)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 25); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 7)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 25); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 25); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 7; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 25); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 21; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 25); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 25); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = 
in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 15)) << 10; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 25); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 1)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 25); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 25); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 13; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 25); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 25); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 23)) << 2; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 25); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 25); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 25); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 5; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 25); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 19; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 25); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 25); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 17)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 25); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 3)) << 22; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 25); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 25); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 11; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 25); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); - out[INDEX(63, lane)] = 
tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_25bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_25bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_25bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_25bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_25bw_16t(in, out, thread_idx); + _bit_unpack_64_25bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_26bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_26bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 26); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 26); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 14)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 26); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 2)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 26); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 26); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 16)) << 10; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 26); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 22; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 26); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 26); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 18)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 26); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 26); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 26); - out[INDEX(16, lane)] = tmp; + 
out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 6; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 26); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 18; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 26); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 26); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 26); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 26); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 26); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 2; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 26); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 14; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 26); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 26; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 26); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 26); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 14)) << 12; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 26); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 2)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 26); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 26); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 10; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 26); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & 
MASK(uint64_t, 4)) << 22; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 26); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 26); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 26); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 6)) << 20; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 26); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 26); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 6; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 26); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 8)) << 18; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 26); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 26); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 22)) << 4; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 26); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 26); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 26); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 2; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 26); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 14; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 26); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_26bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_26bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_26bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_26bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ 
void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_26bw_16t(in, out, thread_idx); + _bit_unpack_64_26bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_27bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_27bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 27); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 27); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 17)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 27); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 27); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 27); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 3; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 27); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 13; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 27); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 23; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 27); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 27); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 21)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 27); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 27); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 27); 
- out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 27); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 9; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 27); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 19; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 27); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 27); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 25)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 27); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 15)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 27); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 5)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 27); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 27); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 5; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 27); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 15; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 27); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 25; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 27); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 27); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 19)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 27); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 9)) << 18; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 27); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 27); - out[INDEX(44, 
lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 1; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 27); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 11; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 27); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 21; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 27); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 27); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 23)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 27); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 14; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 27); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 3)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 27); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 27); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 7; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 27); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 17; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 27); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_27bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_27bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_27bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_27bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); 
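// NOTE: the pattern in this generated file is uniform. Every `_bit_unpack_64_*bw_lane`
// device function and every `bit_unpack_64_*bw_16t` entry point takes an extra
// `uint64_t reference` argument, and each store becomes `out[INDEX(k, lane)] = tmp + reference`,
// fusing the frame-of-reference add into the unpacking pass instead of running it as a
// separate kernel over the unpacked output. The launch geometry is unchanged: each block
// reads 128 * bw / sizeof(uint64_t) = 16 * bw input words (1024 * bw bits) and writes
// 1024 values, with the 16 threads of a block each unpacking one lane of 64 values.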
- _bit_unpack_64_27bw_16t(in, out, thread_idx); + _bit_unpack_64_27bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_28bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_28bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = 
(src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(46, lane)] 
= tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 28; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 28); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 28); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 8; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 28); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 12)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 28); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 24; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 28); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 28); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 4; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 28); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 12; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 28); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 20; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 28); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_28bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_28bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_28bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_28bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_28bw_16t(in, out, thread_idx); + _bit_unpack_64_28bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_29bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_29bw_lane(const uint64_t *__restrict in, 
uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 29); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 29); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 23)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 29); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 17)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 29); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 11)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 29); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 5)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 29); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 29); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 28)) << 1; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 29); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 22)) << 7; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 29); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 16)) << 13; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 29); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 19; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 29); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 25; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 29); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 29); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 27)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 29); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src 
>> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 21)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 29); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 15)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 29); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 9)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 29); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 3)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 29); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 29); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 3; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 29); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 9; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 29); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 15; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 29); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 21; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 29); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 27; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 29); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 29); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 25)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 29); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 19)) << 10; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 29); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 
22]; tmp |= (src & MASK(uint64_t, 13)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 29); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 7)) << 22; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 29); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 1)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 29); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 29); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 5; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 29); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 18)) << 11; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 29); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 17; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 29); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 23; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 29); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_29bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_29bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_29bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_29bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_29bw_16t(in, out, thread_idx); + _bit_unpack_64_29bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_30bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_30bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 30); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, 
lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 30); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 26)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 30); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 30); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 30); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 30); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 30); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 30); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 30); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 30); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 2; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 30); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 6; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 30); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 10; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 30); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 14; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 30); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane 
+ LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 18; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 30); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 22; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 30); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 26; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 30); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 30; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 30); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 30); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 4; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 30); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 22)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 30); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 12; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 30); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 14)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 30); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 10)) << 20; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 30); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 30); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 2)) << 28; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 30); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 30); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 
2; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 30); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 6; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 30); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 10; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 30); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 14; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 30); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 18; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 30); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 22; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 30); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 26; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 30); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_30bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_30bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_30bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_30bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_30bw_16t(in, out, thread_idx); + _bit_unpack_64_30bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_31bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_31bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 31); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 31); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 62) & 
MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 29)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 31); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 27)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 31); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 25)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 31); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 23)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 31); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 21)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 31); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 19)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 31); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 17)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 31); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 15)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 31); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 13)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 31); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 11)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 31); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 9)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 31); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 7)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 31); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 38) & 
MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 5)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 31); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 3)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 31); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 1)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 31); - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 31); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 1; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 31); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 3; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 31); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 5; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 31); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 24)) << 7; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 31); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 9; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 31); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 11; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 31); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 13; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 31); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 15; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 31); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 17; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; 
tmp = (src >> 14) & MASK(uint64_t, 31); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 19; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 31); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 21; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 31); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 23; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 31); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 25; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 31); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 27; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 31); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 2)) << 29; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 31); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_31bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_31bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_31bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_31bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_31bw_16t(in, out, thread_idx); + _bit_unpack_64_31bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_32bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_32bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & 
MASK(uint64_t, 32); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 
32); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 0) & 
MASK(uint64_t, 32); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 0)) << 32; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 32); - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_32bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_32bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_32bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_32bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_32bw_16t(in, out, thread_idx); + _bit_unpack_64_32bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_33bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_33bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 33); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 2)) << 31; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 33); - 
out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 29; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 33); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 6)) << 27; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 33); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 25; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 33); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 23; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 33); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 21; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 33); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 19; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 33); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 17; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 33); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 15; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 33); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 13; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 33); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 22)) << 11; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 33); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 9; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 33); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 26)) << 7; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 33); - 
out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 5; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 33); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 3; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 33); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 1; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 1)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 33); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 30; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 33); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 5)) << 28; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 33); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 7)) << 26; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 33); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 9)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 33); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 11)) << 22; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 33); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 20; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 33); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 15)) << 18; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 33); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 17)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 33); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 50) & 
MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 19)) << 14; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 33); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 21)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 33); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 23)) << 10; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 33); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 25)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 33); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 27)) << 6; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 33); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 29)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 33); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 31)) << 2; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_33bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_33bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_33bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_33bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 33 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_33bw_16t(in, out, thread_idx); + _bit_unpack_64_33bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_34bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_34bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 34); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 4)) << 30; - out[INDEX(1, lane)] = tmp; + 
out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 34); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 8)) << 26; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 34); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 22; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 34); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 16)) << 18; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 34); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 20)) << 14; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 34); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 24)) << 10; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 34); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 28)) << 6; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 34); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 32)) << 2; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 2)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 34); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 28; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 34); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 34); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 20; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 34); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 34); - out[INDEX(25, lane)] = tmp; + 
out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 12; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 34); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 34); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 4; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 0)) << 34; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 34); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 4)) << 30; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 34); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 26; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 34); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 12)) << 22; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 34); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 16)) << 18; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 34); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 14; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 34); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 24)) << 10; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 34); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 28)) << 6; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 34); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 2; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 2)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, 
lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 34); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 6)) << 28; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 34); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 10)) << 24; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 34); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 14)) << 20; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 34); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 34); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 22)) << 12; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 34); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 26)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 34); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 4; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_34bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_34bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_34bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_34bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 34 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_34bw_16t(in, out, thread_idx); + _bit_unpack_64_34bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_35bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_35bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 35); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + 
reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 6)) << 29; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 35); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 12)) << 23; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 35); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 17; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 35); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 11; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 35); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 30)) << 5; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 34; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 35); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 7)) << 28; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 35); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 13)) << 22; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 35); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 19)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 35); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 25)) << 10; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 35); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 31)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 33; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 35); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 27; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & 
MASK(uint64_t, 35); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 14)) << 21; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 35); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 15; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 35); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 9; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 35); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 3; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 35); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 26; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 35); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 15)) << 20; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 35); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 21)) << 14; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 35); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 27)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 35); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 33)) << 2; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 31; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 35); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 10)) << 25; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 35); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 
19; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 35); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 22)) << 13; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 35); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 28)) << 7; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 35); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 1; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 5)) << 30; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 35); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 11)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 35); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 17)) << 18; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 35); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 23)) << 12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 35); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 6; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_35bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_35bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_35bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_35bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 35 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_35bw_16t(in, out, thread_idx); + _bit_unpack_64_35bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_36bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_36bw_lane(const uint64_t *__restrict in, uint64_t 
*__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = 
tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; 
tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 36; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 36); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 28; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 36); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 20; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 36); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 24)) << 12; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 36); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 4; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 36); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 12)) << 24; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 36); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 20)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 36); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 8; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_36bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_36bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_36bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_36bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 36 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - 
_bit_unpack_64_36bw_16t(in, out, thread_idx); + _bit_unpack_64_36bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_37bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_37bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 37); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 10)) << 27; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 37); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 20)) << 17; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 37); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 30)) << 7; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 3)) << 34; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 37); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 13)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 37); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 23)) << 14; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 37); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 33)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 6)) << 31; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 37); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 16)) << 21; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 37); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 26)) << 11; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 37); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 36)) << 1; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 12]; tmp |= 
(src & MASK(uint64_t, 9)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 37); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 19)) << 18; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 37); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 29)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 2)) << 35; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 37); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 12)) << 25; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 37); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 22)) << 15; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 37); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 32)) << 5; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 5)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 37); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 15)) << 22; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 37); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 25)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 37); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 35)) << 2; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 8)) << 29; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 37); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 18)) << 19; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 37); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 25]; tmp |= (src & 
MASK(uint64_t, 28)) << 9; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 1)) << 36; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 37); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 11)) << 26; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 37); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 21)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 37); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 31)) << 6; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 4)) << 33; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 37); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 14)) << 23; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 37); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 24)) << 13; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 37); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 34)) << 3; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 7)) << 30; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 37); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 17)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 37); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 27)) << 10; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_37bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_37bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_37bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_37bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" 
__global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 37 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_37bw_16t(in, out, thread_idx); + _bit_unpack_64_37bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_38bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_38bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 38); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 12)) << 26; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 38); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 24)) << 14; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 38); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 36)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 10)) << 28; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 38); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 22)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 38); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 34)) << 4; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 30; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 38); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 20)) << 18; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 38); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 32)) << 6; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 6)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 38); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + 
reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 18)) << 20; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 38); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 34; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 38); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 22; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 38); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 28)) << 10; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 36; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 38); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 38); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 12; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 0)) << 38; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 38); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 12)) << 26; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 38); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 14; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 38); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 36)) << 2; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 28; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 38); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 22)) << 16; 
- out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 38); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 34)) << 4; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 8)) << 30; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 38); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 20)) << 18; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 38); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 6; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 6)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 38); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 20; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 38); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 30)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 34; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 38); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 16)) << 22; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 38); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 28)) << 10; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 2)) << 36; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 38); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 14)) << 24; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 38); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 12; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); - out[INDEX(63, 
lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_38bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_38bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_38bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_38bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 38 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_38bw_16t(in, out, thread_idx); + _bit_unpack_64_38bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_39bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_39bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 39); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 14)) << 25; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 39); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 28)) << 11; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 3)) << 36; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 39); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 17)) << 22; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 39); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 31)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 6)) << 33; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 39); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 19; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 39); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 34)) << 5; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 30; - out[INDEX(14, lane)] = 
tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 39); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 23)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 39); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 37)) << 2; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 27; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 39); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 26)) << 13; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 1)) << 38; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 39); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 15)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 39); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 29)) << 10; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 35; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 39); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 21; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 39); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 7; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 7)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 39); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 21)) << 18; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 39); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 35)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 23]; tmp |= 
(src & MASK(uint64_t, 10)) << 29; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 39); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 15; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 39); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 1; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 13)) << 26; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 39); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 27)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 2)) << 37; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 39); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 23; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 39); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 30)) << 9; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 5)) << 34; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 39); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 19)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 39); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 33)) << 6; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 31; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 39); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 22)) << 17; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 39); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 3; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & 
MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 11)) << 28; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 39); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 25)) << 14; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_39bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_39bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_39bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_39bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 39 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_39bw_16t(in, out, thread_idx); + _bit_unpack_64_39bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_40bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_40bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & 
MASK(uint64_t, 32)) << 8; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(33, lane)] = tmp; + 
out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 40); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 0)) << 40; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) 
& MASK(uint64_t, 40); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 16)) << 24; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 40); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 8; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 8)) << 32; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 40); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 16; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_40bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_40bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_40bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_40bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 40 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_40bw_16t(in, out, thread_idx); + _bit_unpack_64_40bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_41bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_41bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 41); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 18)) << 23; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 41); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 36)) << 5; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 13)) << 28; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 41); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 31)) << 10; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 5]; tmp |= (src & 
MASK(uint64_t, 8)) << 33; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 41); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 26)) << 15; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 3)) << 38; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 41); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 21)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 41); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 39)) << 2; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 16)) << 25; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 41); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 34)) << 7; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 11)) << 30; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 41); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 29)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 6)) << 35; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 41); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 24)) << 17; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 1)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 41); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 19)) << 22; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 41); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 37)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 14)) << 27; - out[INDEX(29, lane)] = tmp; + 
out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 41); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 32)) << 9; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 9)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 41); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 27)) << 14; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 4)) << 37; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 41); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 22)) << 19; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 41); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 40)) << 1; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 17)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 41); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 35)) << 6; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 12)) << 29; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 41); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 30)) << 11; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 7)) << 34; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 41); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 25)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 2)) << 39; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 41); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 20)) << 21; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 
20) & MASK(uint64_t, 41); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 38)) << 3; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 15)) << 26; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 41); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 33)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 10)) << 31; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 41); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 28)) << 13; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 5)) << 36; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 41); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 23)) << 18; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_41bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_41bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_41bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_41bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 41 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_41bw_16t(in, out, thread_idx); + _bit_unpack_64_41bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_42bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_42bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 42); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 20)) << 22; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 42); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; 
tmp |= (src & MASK(uint64_t, 40)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 18)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 42); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 38)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 26; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 42); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 36)) << 6; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 14)) << 28; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 42); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 34)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 30; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 42); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 10; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 42); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 12; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 34; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 42); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 14; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 36; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 42); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 
17]; tmp |= (src & MASK(uint64_t, 4)) << 38; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 42); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 18; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 40; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 42); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 20; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 42; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 42); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 22; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 42); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 2; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 18)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 42); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 4; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 26; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 42); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 6; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 14)) << 28; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 42); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 12)) << 30; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 42); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 10; - 
out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 10)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 42); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 12; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 34; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 42); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 14; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 6)) << 36; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 42); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 4)) << 38; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 42); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 18; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 2)) << 40; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 42); - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 20; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_42bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_42bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_42bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_42bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 42 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_42bw_16t(in, out, thread_idx); + _bit_unpack_64_42bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_43bw_lane(const uint64_t 
*__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_43bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 43); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 22)) << 21; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 1)) << 42; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 43); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 23)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 2)) << 41; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 43); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 24)) << 19; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 3)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 43); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 25)) << 18; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 4)) << 39; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 43); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 26)) << 17; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 5)) << 38; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 43); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 27)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 6)) << 37; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 43); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 28)) << 15; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 14]; tmp |= (src & 
MASK(uint64_t, 7)) << 36; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 43); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 29)) << 14; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 8)) << 35; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 43); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 30)) << 13; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 9)) << 34; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 43); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 31)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 10)) << 33; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 43); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 32)) << 11; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 11)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 43); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 33)) << 10; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 12)) << 31; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 43); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 34)) << 9; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 13)) << 30; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 43); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 35)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 14)) << 29; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 43); - out[INDEX(42, lane)] 
= tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 36)) << 7; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 15)) << 28; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 43); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 37)) << 6; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 16)) << 27; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 43); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 38)) << 5; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 17)) << 26; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 43); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 39)) << 4; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 18)) << 25; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 43); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 40)) << 3; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 19)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 43); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 41)) << 2; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 20)) << 23; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 43); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 42)) << 1; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 21)) << 22; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_43bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { 
- _bit_unpack_64_43bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_43bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_43bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 43 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_43bw_16t(in, out, thread_idx); + _bit_unpack_64_43bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_44bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_44bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = 
in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 28) & 
MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 0)) << 44; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 44); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 20; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 4)) << 40; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 44); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 28)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 8)) << 36; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 44); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 32)) << 12; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 12)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 44); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 36)) << 8; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; 
tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 28; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 44); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 40)) << 4; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 20)) << 24; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_44bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_44bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_44bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_44bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 44 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_44bw_16t(in, out, thread_idx); + _bit_unpack_64_44bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_45bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_45bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 45); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 26)) << 19; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 7)) << 38; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 45); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 33)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 31; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 45); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 40)) << 5; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 21)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 43; 
- out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 45); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 17; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 9)) << 36; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 45); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 35)) << 10; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 29; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 45); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 42)) << 3; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 23)) << 22; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 41; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 45); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 15; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 11)) << 34; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 45); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 37)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 18)) << 27; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 45); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 1; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 25)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 39; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 45); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 22]; tmp |= (src & 
MASK(uint64_t, 32)) << 13; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 45); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 39)) << 6; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 25; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 1)) << 44; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 45); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 27)) << 18; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 37; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 45); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 11; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 15)) << 30; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 45); - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 41)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 22)) << 23; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 3)) << 42; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 45); - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 10)) << 35; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 45); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 9; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 17)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + 
reference; tmp = (src >> 17) & MASK(uint64_t, 45); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 43)) << 2; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 21; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 5)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 45); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 31)) << 14; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 33; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 45); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 38)) << 7; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 19)) << 26; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_45bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_45bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_45bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_45bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 45 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_45bw_16t(in, out, thread_idx); + _bit_unpack_64_45bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_46bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_46bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 46); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 28)) << 18; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 10)) << 36; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 46); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, 
lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 38)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 20)) << 26; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 2)) << 44; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 46); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 30)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 12)) << 34; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 46); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 40)) << 6; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 22)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 4)) << 42; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 46); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 32)) << 14; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 14)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 46); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 42)) << 4; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 24)) << 22; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 6)) << 40; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 46); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 34)) << 12; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 30; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 46); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= 
(src & MASK(uint64_t, 44)) << 2; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 20; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 38; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 46); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 36)) << 10; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 28; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 0)) << 46; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 46); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 28)) << 18; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 10)) << 36; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 46); - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 38)) << 8; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 20)) << 26; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 2)) << 44; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 46); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 30)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 12)) << 34; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 46); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 40)) << 6; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 22)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 4)) << 42; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 46); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = 
tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 32)) << 14; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 14)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 46); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 42)) << 4; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 24)) << 22; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 6)) << 40; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 46); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 34)) << 12; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 16)) << 30; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 46); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 44)) << 2; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 26)) << 20; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 8)) << 38; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 46); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 36)) << 10; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 18)) << 28; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_46bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_46bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_46bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_46bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_46bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_46bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 46 / sizeof(uint64_t))); auto 
out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_46bw_16t(in, out, thread_idx); + _bit_unpack_64_46bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_47bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_47bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 47); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 30)) << 17; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 13)) << 34; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 47); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 43)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 26)) << 21; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 9)) << 38; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 47); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 39)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 22)) << 25; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 5)) << 42; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 47); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 35)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 18)) << 29; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 1)) << 46; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 47); - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 31)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 14)) << 33; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 47); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp 
= (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 44)) << 3; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 27)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 10)) << 37; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 47); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 40)) << 7; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 23)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 6)) << 41; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 47); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 36)) << 11; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 19)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 2)) << 45; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 47); - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 32)) << 15; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 15)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 47); - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 45)) << 2; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 28)) << 19; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 11)) << 36; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 47); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 41)) << 6; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 24)) << 23; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) 
& MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 7)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 47); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 37)) << 10; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 20)) << 27; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 3)) << 44; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 47); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 33)) << 14; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 16)) << 31; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 47); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 46)) << 1; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 29)) << 18; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 12)) << 35; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 47); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 42)) << 5; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 25)) << 22; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 8)) << 39; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 47); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 38)) << 9; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 21)) << 26; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 4)) << 43; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 47); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 34)) 
<< 13; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 17)) << 30; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_47bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_47bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_47bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_47bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 47 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_47bw_16t(in, out, thread_idx); + _bit_unpack_64_47bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_48bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_48bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(11, lane)] = tmp; + 
out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = 
in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + 
LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 0)) << 48; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 48); - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 32)) << 16; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 32; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_48bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_48bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_48bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_48bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 48 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_48bw_16t(in, out, thread_idx); + _bit_unpack_64_48bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_49bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_49bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 49); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 34)) << 15; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 19)) << 30; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 4)) << 45; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 49); - out[INDEX(4, lane)] = tmp; + 
out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 38)) << 11; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 23)) << 26; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 41; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 49); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 42)) << 7; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 27)) << 22; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 12)) << 37; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 49); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 46)) << 3; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 31)) << 18; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 33; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 1)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 49); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 35)) << 14; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 20)) << 29; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 5)) << 44; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 49); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 39)) << 10; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 25; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 9)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 49); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + 
reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 43)) << 6; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 28)) << 21; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 13)) << 36; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 49); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 47)) << 2; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 32)) << 17; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 17)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 2)) << 47; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 49); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 13; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 21)) << 28; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 6)) << 43; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 49); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 40)) << 9; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 25)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 10)) << 39; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 49); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 44)) << 5; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 20; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 14)) << 35; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 49); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp 
= (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 48)) << 1; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 33)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 18)) << 31; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 3)) << 46; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 49); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 37)) << 12; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 27; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 7)) << 42; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 49); - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 41)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 26)) << 23; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 11)) << 38; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 49); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 45)) << 4; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 30)) << 19; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 15)) << 34; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_49bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_49bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_49bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_49bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in 
+ (blockIdx.x * (128 * 49 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_49bw_16t(in, out, thread_idx); + _bit_unpack_64_49bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_50bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_50bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 50); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 36)) << 14; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 22)) << 28; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 8)) << 42; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 50); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 44)) << 6; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 30)) << 20; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 16)) << 34; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 2)) << 48; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 50); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 38)) << 12; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 26; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 10)) << 40; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 50); - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 46)) << 4; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 32)) << 18; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 18)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 46; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp 
+ reference; tmp = (src >> 4) & MASK(uint64_t, 50); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 40)) << 10; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 26)) << 24; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 38; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 50); - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 48)) << 2; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 34)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 30; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 44; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 50); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 42)) << 8; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 22; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 36; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 0)) << 50; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 50); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 36)) << 14; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 22)) << 28; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 42; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 50); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 44)) << 6; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 30)) << 20; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; 
tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 16)) << 34; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 2)) << 48; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 50); - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 38)) << 12; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 26; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 10)) << 40; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 50); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 46)) << 4; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 18; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 18)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 4)) << 46; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 50); - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 40)) << 10; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 26)) << 24; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 38; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 50); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 48)) << 2; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 34)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 20)) << 30; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 6)) << 44; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 50); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src 
>> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 42)) << 8; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 28)) << 22; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 14)) << 36; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_50bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_50bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_50bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_50bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 50 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_50bw_16t(in, out, thread_idx); + _bit_unpack_64_50bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_51bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_51bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 51); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 38)) << 13; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 25)) << 26; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 12)) << 39; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 51); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 50)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 37)) << 14; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 24)) << 27; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 11)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 51); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = 
in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 49)) << 2; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 36)) << 15; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 23)) << 28; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 10)) << 41; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 51); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 48)) << 3; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 35)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 29; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 42; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 51); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 47)) << 4; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 34)) << 17; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 21)) << 30; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 8)) << 43; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 51); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 46)) << 5; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 33)) << 18; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 20)) << 31; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 7)) << 44; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 51); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 45)) << 6; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + 
reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 32)) << 19; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 19)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 6)) << 45; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 51); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 44)) << 7; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 31)) << 20; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 33; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 5)) << 46; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 51); - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 43)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 21; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 17)) << 34; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 4)) << 47; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 51); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 42)) << 9; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 29)) << 22; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 16)) << 35; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 3)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 51); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 41)) << 10; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 28)) << 23; - 
out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 15)) << 36; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 2)) << 49; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 51); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 40)) << 11; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 27)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 14)) << 37; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 1)) << 50; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 51); - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 39)) << 12; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 26)) << 25; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 13)) << 38; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_51bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_51bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_51bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_51bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 51 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_51bw_16t(in, out, thread_idx); + _bit_unpack_64_51bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_52bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_52bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(1, 
lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 18]; tmp |= (src & 
MASK(uint64_t, 44)) << 8; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 60) & 
MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 0)) << 52; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 52); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 40)) << 12; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 28)) << 24; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 16)) << 36; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 4)) << 48; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 52); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 44)) << 8; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 32)) << 20; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 20)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 8)) << 44; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 52); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 48)) << 4; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 36)) << 16; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 24)) << 28; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 12)) << 40; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); - out[INDEX(63, lane)] = tmp; + 
out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_52bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_52bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_52bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_52bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 52 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_52bw_16t(in, out, thread_idx); + _bit_unpack_64_52bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_53bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_53bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 53); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 42)) << 11; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 31)) << 22; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 20)) << 33; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 9)) << 44; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 53); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 51)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 40)) << 13; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 29)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 18)) << 35; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 7)) << 46; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 53); - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 49)) << 4; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 38)) << 15; - 
out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 27)) << 26; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 16)) << 37; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 5)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 53); - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 47)) << 6; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 36)) << 17; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 25)) << 28; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 14)) << 39; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 3)) << 50; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 53); - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 45)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 34)) << 19; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 23)) << 30; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 12)) << 41; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 1)) << 52; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 53); - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 43)) << 10; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 32)) << 21; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 21)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 10)) << 43; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, 
lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 53); - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 52)) << 1; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 41)) << 12; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 30)) << 23; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 19)) << 34; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 8)) << 45; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 53); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 50)) << 3; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 39)) << 14; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 28)) << 25; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 17)) << 36; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 6)) << 47; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 53); - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 48)) << 5; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 37)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 26)) << 27; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 15)) << 38; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 4)) << 49; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 53); - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 46)) << 7; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 
35)) << 18; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 24)) << 29; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 13)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 2)) << 51; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 53); - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 44)) << 9; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 33)) << 20; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 22)) << 31; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 11)) << 42; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_53bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_53bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_53bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_53bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 53 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_53bw_16t(in, out, thread_idx); + _bit_unpack_64_53bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_54bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_54bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 54); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 44)) << 10; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 34)) << 20; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 24)) << 30; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = 
in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 14)) << 40; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 4)) << 50; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 54); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 48)) << 6; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 38)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 26; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 18)) << 36; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 8)) << 46; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 54); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 52)) << 2; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 42)) << 12; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 32)) << 22; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 22)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 12)) << 42; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 52; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 54); - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 46)) << 8; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 36)) << 18; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 26)) << 28; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 16)) << 38; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 6)) << 48; - 
out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 54); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 50)) << 4; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 14; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 30)) << 24; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 34; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 44; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 0)) << 54; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 54); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 44)) << 10; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 20; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 24)) << 30; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 14)) << 40; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 50; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 54); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 48)) << 6; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 38)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 26; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 18)) << 36; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 8)) << 46; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 54); - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + 
LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 52)) << 2; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 42)) << 12; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 32)) << 22; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 42; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 2)) << 52; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 54); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 46)) << 8; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 36)) << 18; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 26)) << 28; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 38; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 6)) << 48; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 54); - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 50)) << 4; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 40)) << 14; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 30)) << 24; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 20)) << 34; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 10)) << 44; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_54bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_54bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_54bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t 
reference, int thread_idx) { + _bit_unpack_64_54bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 54 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_54bw_16t(in, out, thread_idx); + _bit_unpack_64_54bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_55bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_55bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 55); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 46)) << 9; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 37)) << 18; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 28)) << 27; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 19)) << 36; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 10)) << 45; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 1)) << 54; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 55); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 47)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 38)) << 17; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 29)) << 26; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 35; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 11)) << 44; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 2)) << 53; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 55); - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT 
* 13]; tmp |= (src & MASK(uint64_t, 48)) << 7; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 39)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 30)) << 25; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 21)) << 34; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 12)) << 43; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 3)) << 52; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 55); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 49)) << 6; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 15; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 31)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 22)) << 33; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 13)) << 42; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 4)) << 51; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 55); - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 50)) << 5; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 41)) << 14; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 32)) << 23; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 23)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 14)) << 41; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 5)) << 50; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 55); - 
out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 51)) << 4; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 42)) << 13; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 33)) << 22; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 24)) << 31; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 15)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 6)) << 49; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 55); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 52)) << 3; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 43)) << 12; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 34)) << 21; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 25)) << 30; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 39; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 7)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 55); - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 53)) << 2; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 44)) << 11; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 35)) << 20; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 26)) << 29; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 17)) << 38; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 48]; tmp |= (src & 
MASK(uint64_t, 8)) << 47; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 55); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 54)) << 1; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 45)) << 10; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 36)) << 19; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 27)) << 28; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 18)) << 37; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 9)) << 46; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_55bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_55bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_55bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_55bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 55 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_55bw_16t(in, out, thread_idx); + _bit_unpack_64_55bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_56bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_56bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src 
= in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 48)) << 8; - 
out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & 
MASK(uint64_t, 16)) << 40; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 0)) << 56; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 56); - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 48)) << 8; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 40)) << 16; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 32)) << 24; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 24)) << 32; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 16)) << 40; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 8)) << 48; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_56bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_56bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_56bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + 
_bit_unpack_64_56bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 56 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_56bw_16t(in, out, thread_idx); + _bit_unpack_64_56bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_57bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_57bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 57); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 50)) << 7; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 43)) << 14; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 36)) << 21; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 29)) << 28; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 22)) << 35; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 15)) << 42; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 8)) << 49; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 1)) << 56; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 57); - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 51)) << 6; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 44)) << 13; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 37)) << 20; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 30)) << 27; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 23)) << 34; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 23) & 
MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 16)) << 41; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 9)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 2)) << 55; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 57); - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 52)) << 5; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 45)) << 12; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 38)) << 19; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 31)) << 26; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 24)) << 33; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 17)) << 40; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 10)) << 47; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 3)) << 54; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 57); - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 53)) << 4; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 46)) << 11; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 39)) << 18; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 32)) << 25; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 25)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 18)) << 39; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 11)) << 46; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + 
reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 4)) << 53; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 57); - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 54)) << 3; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 47)) << 10; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 40)) << 17; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 33)) << 24; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 31; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 19)) << 38; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 12)) << 45; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 5)) << 52; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 57); - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 55)) << 2; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 48)) << 9; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 41)) << 16; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 34)) << 23; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 27)) << 30; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 20)) << 37; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 13)) << 44; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 6)) << 51; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 57); - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); 
src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 56)) << 1; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 49)) << 8; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 42)) << 15; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 35)) << 22; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 28)) << 29; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 21)) << 36; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 14)) << 43; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 7)) << 50; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_57bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_57bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_57bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_57bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 57 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_57bw_16t(in, out, thread_idx); + _bit_unpack_64_57bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_58bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_58bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 58); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 52)) << 6; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 46)) << 12; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 40)) << 18; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 
34)) << 24; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 28)) << 30; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 22)) << 36; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 16)) << 42; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 10)) << 48; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 4)) << 54; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 58); - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 56)) << 2; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 50)) << 8; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 44)) << 14; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 38)) << 20; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 32)) << 26; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 26)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 20)) << 38; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 14)) << 44; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 8)) << 50; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 2)) << 56; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 58); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 54)) << 4; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 48)) << 10; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 22]; tmp |= (src & 
MASK(uint64_t, 42)) << 16; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 36)) << 22; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 30)) << 28; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 24)) << 34; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 18)) << 40; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 46; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 52; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 0)) << 58; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 58); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 52)) << 6; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 46)) << 12; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 40)) << 18; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 34)) << 24; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 28)) << 30; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 22)) << 36; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 16)) << 42; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 10)) << 48; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 4)) << 54; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 58); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 56)) << 2; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + 
LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 50)) << 8; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 44)) << 14; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 38)) << 20; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 32)) << 26; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 26)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 20)) << 38; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 14)) << 44; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 8)) << 50; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 2)) << 56; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 58); - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 54)) << 4; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 48)) << 10; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 42)) << 16; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 36)) << 22; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 30)) << 28; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 24)) << 34; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 18)) << 40; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 12)) << 46; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 6)) << 52; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void 
_bit_unpack_64_58bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_58bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_58bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_58bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 58 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_58bw_16t(in, out, thread_idx); + _bit_unpack_64_58bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_59bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_59bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 59); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 54)) << 5; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 49)) << 10; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 44)) << 15; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 39)) << 20; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 34)) << 25; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 29)) << 30; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 24)) << 35; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 19)) << 40; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 14)) << 45; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 9)) << 50; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 4)) << 55; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 59); - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 12]; tmp |= (src & 
MASK(uint64_t, 58)) << 1; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 53)) << 6; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 48)) << 11; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 43)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 38)) << 21; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 33)) << 26; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 28)) << 31; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 23)) << 36; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 18)) << 41; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 13)) << 46; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 8)) << 51; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 3)) << 56; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 59); - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 57)) << 2; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 52)) << 7; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 47)) << 12; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 42)) << 17; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 37)) << 22; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 32)) << 27; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 27)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + 
reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 22)) << 37; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 17)) << 42; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 12)) << 47; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 7)) << 52; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 2)) << 57; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 59); - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 56)) << 3; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 51)) << 8; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 46)) << 13; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 41)) << 18; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 36)) << 23; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 31)) << 28; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 26)) << 33; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 21)) << 38; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 16)) << 43; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 11)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 6)) << 53; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 1)) << 58; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 59); - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 55)) << 4; - out[INDEX(52, lane)] = tmp; + 
out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 50)) << 9; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 45)) << 14; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 40)) << 19; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 35)) << 24; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 30)) << 29; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 25)) << 34; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 20)) << 39; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 15)) << 44; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 10)) << 49; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 5)) << 54; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_59bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_59bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_59bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_59bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 59 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_59bw_16t(in, out, thread_idx); + _bit_unpack_64_59bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_60bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_60bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 
56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 40)) << 20; - 
out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 
28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 12)) << 48; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 0)) << 60; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 60); - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 56)) << 4; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 52)) << 8; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 48)) << 12; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 44)) << 16; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 40)) << 20; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 36)) << 24; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 32)) << 28; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 28)) << 32; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 24)) << 36; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 20)) << 40; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 16)) << 44; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 57]; tmp |= (src & 
MASK(uint64_t, 12)) << 48; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 8)) << 52; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 4)) << 56; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_60bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_60bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_60bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_60bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 60 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_60bw_16t(in, out, thread_idx); + _bit_unpack_64_60bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_61bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_61bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 61); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 58)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 55)) << 6; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 52)) << 9; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 49)) << 12; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 46)) << 15; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 43)) << 18; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 40)) << 21; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 37)) << 24; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 34)) << 27; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = 
tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 31)) << 30; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 28)) << 33; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 25)) << 36; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 22)) << 39; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 19)) << 42; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 16)) << 45; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 13)) << 48; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 10)) << 51; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 7)) << 54; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 4)) << 57; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 1)) << 60; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 61); - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 59)) << 2; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 59) & MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 56)) << 5; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 53)) << 8; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 50)) << 11; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 47)) << 14; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 44)) << 17; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 41)) << 20; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 
28]; tmp |= (src & MASK(uint64_t, 38)) << 23; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 35)) << 26; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 32)) << 29; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 29)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 26)) << 35; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 23)) << 38; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 20)) << 41; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 17)) << 44; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 14)) << 47; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 11)) << 50; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 8)) << 53; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 5)) << 56; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 2)) << 59; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 61); - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 60)) << 1; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 57)) << 4; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 54)) << 7; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 51)) << 10; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 48)) << 13; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 45)) << 16; - out[INDEX(48, lane)] = tmp; + 
out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 42)) << 19; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 39)) << 22; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 36)) << 25; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 33)) << 28; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 30)) << 31; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 27)) << 34; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 24)) << 37; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 21)) << 40; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 18)) << 43; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 15)) << 46; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 12)) << 49; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 9)) << 52; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 6)) << 55; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 3)) << 58; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 61); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_61bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_61bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_61bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_61bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 61 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - 
_bit_unpack_64_61bw_16t(in, out, thread_idx); + _bit_unpack_64_61bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_62bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_62bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 62); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 60)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 58)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 56)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 54)) << 8; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 52)) << 10; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 50)) << 12; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 48)) << 14; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 46)) << 16; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 44)) << 18; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 42)) << 20; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 40)) << 22; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 38)) << 24; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 36)) << 26; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 34)) << 28; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 32)) << 30; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 30)) << 32; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); 
src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 28)) << 34; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 26)) << 36; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 24)) << 38; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 22)) << 40; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 20)) << 42; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 18)) << 44; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 16)) << 46; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 14)) << 48; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 12)) << 50; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 10)) << 52; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 8)) << 54; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 6)) << 56; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 4)) << 58; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 2)) << 60; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 0)) << 62; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint64_t, 62); - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 60)) << 2; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 58)) << 4; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 56)) << 6; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 54)) << 8; - out[INDEX(36, 
lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 52)) << 10; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 50)) << 12; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 48)) << 14; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 46)) << 16; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 44)) << 18; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 42)) << 20; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 40)) << 22; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 38)) << 24; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 36)) << 26; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 34)) << 28; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 32)) << 30; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 30)) << 32; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 28)) << 34; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 26)) << 36; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 24)) << 38; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 22)) << 40; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 20)) << 42; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 18)) << 44; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 16)) << 46; - out[INDEX(55, 
lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 14)) << 48; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 12)) << 50; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 10)) << 52; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 8)) << 54; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 6)) << 56; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 4)) << 58; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 61]; tmp |= (src & MASK(uint64_t, 2)) << 60; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_62bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_62bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_62bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_62bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 62 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_62bw_16t(in, out, thread_idx); + _bit_unpack_64_62bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_63bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_63bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; uint64_t src; uint64_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint64_t, 63); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 63) & MASK(uint64_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint64_t, 62)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 62) & MASK(uint64_t, 2); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint64_t, 61)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 61) & MASK(uint64_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint64_t, 60)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 60) & MASK(uint64_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint64_t, 59)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 59) & 
MASK(uint64_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint64_t, 58)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 58) & MASK(uint64_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint64_t, 57)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 57) & MASK(uint64_t, 7); src = in[lane + LANE_COUNT * 7]; tmp |= (src & MASK(uint64_t, 56)) << 7; - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; tmp = (src >> 56) & MASK(uint64_t, 8); src = in[lane + LANE_COUNT * 8]; tmp |= (src & MASK(uint64_t, 55)) << 8; - out[INDEX(8, lane)] = tmp; + out[INDEX(8, lane)] = tmp + reference; tmp = (src >> 55) & MASK(uint64_t, 9); src = in[lane + LANE_COUNT * 9]; tmp |= (src & MASK(uint64_t, 54)) << 9; - out[INDEX(9, lane)] = tmp; + out[INDEX(9, lane)] = tmp + reference; tmp = (src >> 54) & MASK(uint64_t, 10); src = in[lane + LANE_COUNT * 10]; tmp |= (src & MASK(uint64_t, 53)) << 10; - out[INDEX(10, lane)] = tmp; + out[INDEX(10, lane)] = tmp + reference; tmp = (src >> 53) & MASK(uint64_t, 11); src = in[lane + LANE_COUNT * 11]; tmp |= (src & MASK(uint64_t, 52)) << 11; - out[INDEX(11, lane)] = tmp; + out[INDEX(11, lane)] = tmp + reference; tmp = (src >> 52) & MASK(uint64_t, 12); src = in[lane + LANE_COUNT * 12]; tmp |= (src & MASK(uint64_t, 51)) << 12; - out[INDEX(12, lane)] = tmp; + out[INDEX(12, lane)] = tmp + reference; tmp = (src >> 51) & MASK(uint64_t, 13); src = in[lane + LANE_COUNT * 13]; tmp |= (src & MASK(uint64_t, 50)) << 13; - out[INDEX(13, lane)] = tmp; + out[INDEX(13, lane)] = tmp + reference; tmp = (src >> 50) & MASK(uint64_t, 14); src = in[lane + LANE_COUNT * 14]; tmp |= (src & MASK(uint64_t, 49)) << 14; - out[INDEX(14, lane)] = tmp; + out[INDEX(14, lane)] = tmp + reference; tmp = (src >> 49) & MASK(uint64_t, 15); src = in[lane + LANE_COUNT * 15]; tmp |= (src & MASK(uint64_t, 48)) << 15; - out[INDEX(15, lane)] = tmp; + out[INDEX(15, lane)] = tmp + reference; tmp = (src >> 48) & MASK(uint64_t, 16); src = in[lane + LANE_COUNT * 16]; tmp |= (src & MASK(uint64_t, 47)) << 16; - out[INDEX(16, lane)] = tmp; + out[INDEX(16, lane)] = tmp + reference; tmp = (src >> 47) & MASK(uint64_t, 17); src = in[lane + LANE_COUNT * 17]; tmp |= (src & MASK(uint64_t, 46)) << 17; - out[INDEX(17, lane)] = tmp; + out[INDEX(17, lane)] = tmp + reference; tmp = (src >> 46) & MASK(uint64_t, 18); src = in[lane + LANE_COUNT * 18]; tmp |= (src & MASK(uint64_t, 45)) << 18; - out[INDEX(18, lane)] = tmp; + out[INDEX(18, lane)] = tmp + reference; tmp = (src >> 45) & MASK(uint64_t, 19); src = in[lane + LANE_COUNT * 19]; tmp |= (src & MASK(uint64_t, 44)) << 19; - out[INDEX(19, lane)] = tmp; + out[INDEX(19, lane)] = tmp + reference; tmp = (src >> 44) & MASK(uint64_t, 20); src = in[lane + LANE_COUNT * 20]; tmp |= (src & MASK(uint64_t, 43)) << 20; - out[INDEX(20, lane)] = tmp; + out[INDEX(20, lane)] = tmp + reference; tmp = (src >> 43) & MASK(uint64_t, 21); src = in[lane + LANE_COUNT * 21]; tmp |= (src & MASK(uint64_t, 42)) << 21; - out[INDEX(21, lane)] = tmp; + out[INDEX(21, lane)] = tmp + reference; tmp = (src >> 42) & MASK(uint64_t, 22); src = in[lane + LANE_COUNT * 22]; tmp |= (src & MASK(uint64_t, 41)) << 22; - out[INDEX(22, lane)] = tmp; + out[INDEX(22, lane)] = tmp + reference; tmp = (src >> 41) & MASK(uint64_t, 23); src = in[lane + LANE_COUNT * 23]; tmp |= (src & MASK(uint64_t, 40)) << 23; - out[INDEX(23, lane)] = tmp; + out[INDEX(23, lane)] = tmp + reference; tmp = (src >> 40) & MASK(uint64_t, 24); src = 
in[lane + LANE_COUNT * 24]; tmp |= (src & MASK(uint64_t, 39)) << 24; - out[INDEX(24, lane)] = tmp; + out[INDEX(24, lane)] = tmp + reference; tmp = (src >> 39) & MASK(uint64_t, 25); src = in[lane + LANE_COUNT * 25]; tmp |= (src & MASK(uint64_t, 38)) << 25; - out[INDEX(25, lane)] = tmp; + out[INDEX(25, lane)] = tmp + reference; tmp = (src >> 38) & MASK(uint64_t, 26); src = in[lane + LANE_COUNT * 26]; tmp |= (src & MASK(uint64_t, 37)) << 26; - out[INDEX(26, lane)] = tmp; + out[INDEX(26, lane)] = tmp + reference; tmp = (src >> 37) & MASK(uint64_t, 27); src = in[lane + LANE_COUNT * 27]; tmp |= (src & MASK(uint64_t, 36)) << 27; - out[INDEX(27, lane)] = tmp; + out[INDEX(27, lane)] = tmp + reference; tmp = (src >> 36) & MASK(uint64_t, 28); src = in[lane + LANE_COUNT * 28]; tmp |= (src & MASK(uint64_t, 35)) << 28; - out[INDEX(28, lane)] = tmp; + out[INDEX(28, lane)] = tmp + reference; tmp = (src >> 35) & MASK(uint64_t, 29); src = in[lane + LANE_COUNT * 29]; tmp |= (src & MASK(uint64_t, 34)) << 29; - out[INDEX(29, lane)] = tmp; + out[INDEX(29, lane)] = tmp + reference; tmp = (src >> 34) & MASK(uint64_t, 30); src = in[lane + LANE_COUNT * 30]; tmp |= (src & MASK(uint64_t, 33)) << 30; - out[INDEX(30, lane)] = tmp; + out[INDEX(30, lane)] = tmp + reference; tmp = (src >> 33) & MASK(uint64_t, 31); src = in[lane + LANE_COUNT * 31]; tmp |= (src & MASK(uint64_t, 32)) << 31; - out[INDEX(31, lane)] = tmp; + out[INDEX(31, lane)] = tmp + reference; tmp = (src >> 32) & MASK(uint64_t, 32); src = in[lane + LANE_COUNT * 32]; tmp |= (src & MASK(uint64_t, 31)) << 32; - out[INDEX(32, lane)] = tmp; + out[INDEX(32, lane)] = tmp + reference; tmp = (src >> 31) & MASK(uint64_t, 33); src = in[lane + LANE_COUNT * 33]; tmp |= (src & MASK(uint64_t, 30)) << 33; - out[INDEX(33, lane)] = tmp; + out[INDEX(33, lane)] = tmp + reference; tmp = (src >> 30) & MASK(uint64_t, 34); src = in[lane + LANE_COUNT * 34]; tmp |= (src & MASK(uint64_t, 29)) << 34; - out[INDEX(34, lane)] = tmp; + out[INDEX(34, lane)] = tmp + reference; tmp = (src >> 29) & MASK(uint64_t, 35); src = in[lane + LANE_COUNT * 35]; tmp |= (src & MASK(uint64_t, 28)) << 35; - out[INDEX(35, lane)] = tmp; + out[INDEX(35, lane)] = tmp + reference; tmp = (src >> 28) & MASK(uint64_t, 36); src = in[lane + LANE_COUNT * 36]; tmp |= (src & MASK(uint64_t, 27)) << 36; - out[INDEX(36, lane)] = tmp; + out[INDEX(36, lane)] = tmp + reference; tmp = (src >> 27) & MASK(uint64_t, 37); src = in[lane + LANE_COUNT * 37]; tmp |= (src & MASK(uint64_t, 26)) << 37; - out[INDEX(37, lane)] = tmp; + out[INDEX(37, lane)] = tmp + reference; tmp = (src >> 26) & MASK(uint64_t, 38); src = in[lane + LANE_COUNT * 38]; tmp |= (src & MASK(uint64_t, 25)) << 38; - out[INDEX(38, lane)] = tmp; + out[INDEX(38, lane)] = tmp + reference; tmp = (src >> 25) & MASK(uint64_t, 39); src = in[lane + LANE_COUNT * 39]; tmp |= (src & MASK(uint64_t, 24)) << 39; - out[INDEX(39, lane)] = tmp; + out[INDEX(39, lane)] = tmp + reference; tmp = (src >> 24) & MASK(uint64_t, 40); src = in[lane + LANE_COUNT * 40]; tmp |= (src & MASK(uint64_t, 23)) << 40; - out[INDEX(40, lane)] = tmp; + out[INDEX(40, lane)] = tmp + reference; tmp = (src >> 23) & MASK(uint64_t, 41); src = in[lane + LANE_COUNT * 41]; tmp |= (src & MASK(uint64_t, 22)) << 41; - out[INDEX(41, lane)] = tmp; + out[INDEX(41, lane)] = tmp + reference; tmp = (src >> 22) & MASK(uint64_t, 42); src = in[lane + LANE_COUNT * 42]; tmp |= (src & MASK(uint64_t, 21)) << 42; - out[INDEX(42, lane)] = tmp; + out[INDEX(42, lane)] = tmp + reference; tmp = (src >> 21) & MASK(uint64_t, 43); src = 
in[lane + LANE_COUNT * 43]; tmp |= (src & MASK(uint64_t, 20)) << 43; - out[INDEX(43, lane)] = tmp; + out[INDEX(43, lane)] = tmp + reference; tmp = (src >> 20) & MASK(uint64_t, 44); src = in[lane + LANE_COUNT * 44]; tmp |= (src & MASK(uint64_t, 19)) << 44; - out[INDEX(44, lane)] = tmp; + out[INDEX(44, lane)] = tmp + reference; tmp = (src >> 19) & MASK(uint64_t, 45); src = in[lane + LANE_COUNT * 45]; tmp |= (src & MASK(uint64_t, 18)) << 45; - out[INDEX(45, lane)] = tmp; + out[INDEX(45, lane)] = tmp + reference; tmp = (src >> 18) & MASK(uint64_t, 46); src = in[lane + LANE_COUNT * 46]; tmp |= (src & MASK(uint64_t, 17)) << 46; - out[INDEX(46, lane)] = tmp; + out[INDEX(46, lane)] = tmp + reference; tmp = (src >> 17) & MASK(uint64_t, 47); src = in[lane + LANE_COUNT * 47]; tmp |= (src & MASK(uint64_t, 16)) << 47; - out[INDEX(47, lane)] = tmp; + out[INDEX(47, lane)] = tmp + reference; tmp = (src >> 16) & MASK(uint64_t, 48); src = in[lane + LANE_COUNT * 48]; tmp |= (src & MASK(uint64_t, 15)) << 48; - out[INDEX(48, lane)] = tmp; + out[INDEX(48, lane)] = tmp + reference; tmp = (src >> 15) & MASK(uint64_t, 49); src = in[lane + LANE_COUNT * 49]; tmp |= (src & MASK(uint64_t, 14)) << 49; - out[INDEX(49, lane)] = tmp; + out[INDEX(49, lane)] = tmp + reference; tmp = (src >> 14) & MASK(uint64_t, 50); src = in[lane + LANE_COUNT * 50]; tmp |= (src & MASK(uint64_t, 13)) << 50; - out[INDEX(50, lane)] = tmp; + out[INDEX(50, lane)] = tmp + reference; tmp = (src >> 13) & MASK(uint64_t, 51); src = in[lane + LANE_COUNT * 51]; tmp |= (src & MASK(uint64_t, 12)) << 51; - out[INDEX(51, lane)] = tmp; + out[INDEX(51, lane)] = tmp + reference; tmp = (src >> 12) & MASK(uint64_t, 52); src = in[lane + LANE_COUNT * 52]; tmp |= (src & MASK(uint64_t, 11)) << 52; - out[INDEX(52, lane)] = tmp; + out[INDEX(52, lane)] = tmp + reference; tmp = (src >> 11) & MASK(uint64_t, 53); src = in[lane + LANE_COUNT * 53]; tmp |= (src & MASK(uint64_t, 10)) << 53; - out[INDEX(53, lane)] = tmp; + out[INDEX(53, lane)] = tmp + reference; tmp = (src >> 10) & MASK(uint64_t, 54); src = in[lane + LANE_COUNT * 54]; tmp |= (src & MASK(uint64_t, 9)) << 54; - out[INDEX(54, lane)] = tmp; + out[INDEX(54, lane)] = tmp + reference; tmp = (src >> 9) & MASK(uint64_t, 55); src = in[lane + LANE_COUNT * 55]; tmp |= (src & MASK(uint64_t, 8)) << 55; - out[INDEX(55, lane)] = tmp; + out[INDEX(55, lane)] = tmp + reference; tmp = (src >> 8) & MASK(uint64_t, 56); src = in[lane + LANE_COUNT * 56]; tmp |= (src & MASK(uint64_t, 7)) << 56; - out[INDEX(56, lane)] = tmp; + out[INDEX(56, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint64_t, 57); src = in[lane + LANE_COUNT * 57]; tmp |= (src & MASK(uint64_t, 6)) << 57; - out[INDEX(57, lane)] = tmp; + out[INDEX(57, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint64_t, 58); src = in[lane + LANE_COUNT * 58]; tmp |= (src & MASK(uint64_t, 5)) << 58; - out[INDEX(58, lane)] = tmp; + out[INDEX(58, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint64_t, 59); src = in[lane + LANE_COUNT * 59]; tmp |= (src & MASK(uint64_t, 4)) << 59; - out[INDEX(59, lane)] = tmp; + out[INDEX(59, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint64_t, 60); src = in[lane + LANE_COUNT * 60]; tmp |= (src & MASK(uint64_t, 3)) << 60; - out[INDEX(60, lane)] = tmp; + out[INDEX(60, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint64_t, 61); src = in[lane + LANE_COUNT * 61]; tmp |= (src & MASK(uint64_t, 2)) << 61; - out[INDEX(61, lane)] = tmp; + out[INDEX(61, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint64_t, 62); src = in[lane + 
LANE_COUNT * 62]; tmp |= (src & MASK(uint64_t, 1)) << 62; - out[INDEX(62, lane)] = tmp; + out[INDEX(62, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint64_t, 63); - out[INDEX(63, lane)] = tmp; + out[INDEX(63, lane)] = tmp + reference; } -__device__ void _bit_unpack_64_63bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_63bw_lane(in, out, thread_idx * 1 + 0); +__device__ void _bit_unpack_64_63bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_63bw_lane(in, out, reference, thread_idx * 1 + 0); } -extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 63 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_63bw_16t(in, out, thread_idx); + _bit_unpack_64_63bw_16t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_64_64bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_64_64bw_lane(const uint64_t *__restrict in, uint64_t *__restrict out, const uint64_t reference, unsigned int lane) { unsigned int LANE_COUNT = 16; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; - out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane]; - out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane]; - out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane]; - out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane]; - out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane]; - out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane]; - out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane]; - out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane]; - out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane]; - out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane]; - out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane]; - out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane]; - out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane]; - out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane]; - out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane]; - out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane]; - out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane]; - out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane]; - out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane]; - out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane]; - out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane]; - out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane]; - out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane]; - out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane]; - out[INDEX(32, lane)] = in[LANE_COUNT * 32 + lane]; - out[INDEX(33, lane)] = in[LANE_COUNT * 33 + lane]; - out[INDEX(34, lane)] = in[LANE_COUNT * 34 + lane]; - out[INDEX(35, lane)] = in[LANE_COUNT * 35 + lane]; - out[INDEX(36, lane)] = in[LANE_COUNT * 36 + lane]; - out[INDEX(37, lane)] = in[LANE_COUNT * 37 + lane]; - out[INDEX(38, lane)] = in[LANE_COUNT * 38 + lane]; - out[INDEX(39, lane)] = in[LANE_COUNT * 39 + lane]; - out[INDEX(40, lane)] = 
in[LANE_COUNT * 40 + lane]; - out[INDEX(41, lane)] = in[LANE_COUNT * 41 + lane]; - out[INDEX(42, lane)] = in[LANE_COUNT * 42 + lane]; - out[INDEX(43, lane)] = in[LANE_COUNT * 43 + lane]; - out[INDEX(44, lane)] = in[LANE_COUNT * 44 + lane]; - out[INDEX(45, lane)] = in[LANE_COUNT * 45 + lane]; - out[INDEX(46, lane)] = in[LANE_COUNT * 46 + lane]; - out[INDEX(47, lane)] = in[LANE_COUNT * 47 + lane]; - out[INDEX(48, lane)] = in[LANE_COUNT * 48 + lane]; - out[INDEX(49, lane)] = in[LANE_COUNT * 49 + lane]; - out[INDEX(50, lane)] = in[LANE_COUNT * 50 + lane]; - out[INDEX(51, lane)] = in[LANE_COUNT * 51 + lane]; - out[INDEX(52, lane)] = in[LANE_COUNT * 52 + lane]; - out[INDEX(53, lane)] = in[LANE_COUNT * 53 + lane]; - out[INDEX(54, lane)] = in[LANE_COUNT * 54 + lane]; - out[INDEX(55, lane)] = in[LANE_COUNT * 55 + lane]; - out[INDEX(56, lane)] = in[LANE_COUNT * 56 + lane]; - out[INDEX(57, lane)] = in[LANE_COUNT * 57 + lane]; - out[INDEX(58, lane)] = in[LANE_COUNT * 58 + lane]; - out[INDEX(59, lane)] = in[LANE_COUNT * 59 + lane]; - out[INDEX(60, lane)] = in[LANE_COUNT * 60 + lane]; - out[INDEX(61, lane)] = in[LANE_COUNT * 61 + lane]; - out[INDEX(62, lane)] = in[LANE_COUNT * 62 + lane]; - out[INDEX(63, lane)] = in[LANE_COUNT * 63 + lane]; -} - -__device__ void _bit_unpack_64_64bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, int thread_idx) { - _bit_unpack_64_64bw_lane(in, out, thread_idx * 1 + 0); -} - -extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out) { + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; + out[INDEX(8, lane)] = in[LANE_COUNT * 8 + lane] + reference; + out[INDEX(9, lane)] = in[LANE_COUNT * 9 + lane] + reference; + out[INDEX(10, lane)] = in[LANE_COUNT * 10 + lane] + reference; + out[INDEX(11, lane)] = in[LANE_COUNT * 11 + lane] + reference; + out[INDEX(12, lane)] = in[LANE_COUNT * 12 + lane] + reference; + out[INDEX(13, lane)] = in[LANE_COUNT * 13 + lane] + reference; + out[INDEX(14, lane)] = in[LANE_COUNT * 14 + lane] + reference; + out[INDEX(15, lane)] = in[LANE_COUNT * 15 + lane] + reference; + out[INDEX(16, lane)] = in[LANE_COUNT * 16 + lane] + reference; + out[INDEX(17, lane)] = in[LANE_COUNT * 17 + lane] + reference; + out[INDEX(18, lane)] = in[LANE_COUNT * 18 + lane] + reference; + out[INDEX(19, lane)] = in[LANE_COUNT * 19 + lane] + reference; + out[INDEX(20, lane)] = in[LANE_COUNT * 20 + lane] + reference; + out[INDEX(21, lane)] = in[LANE_COUNT * 21 + lane] + reference; + out[INDEX(22, lane)] = in[LANE_COUNT * 22 + lane] + reference; + out[INDEX(23, lane)] = in[LANE_COUNT * 23 + lane] + reference; + out[INDEX(24, lane)] = in[LANE_COUNT * 24 + lane] + reference; + out[INDEX(25, lane)] = in[LANE_COUNT * 25 + lane] + reference; + out[INDEX(26, lane)] = in[LANE_COUNT * 26 + lane] + reference; + out[INDEX(27, lane)] = in[LANE_COUNT * 27 + lane] + reference; + out[INDEX(28, lane)] = in[LANE_COUNT * 28 + lane] + reference; + out[INDEX(29, lane)] = in[LANE_COUNT * 29 + lane] + reference; + out[INDEX(30, lane)] = in[LANE_COUNT * 30 + lane] + reference; + 
out[INDEX(31, lane)] = in[LANE_COUNT * 31 + lane] + reference; + out[INDEX(32, lane)] = in[LANE_COUNT * 32 + lane] + reference; + out[INDEX(33, lane)] = in[LANE_COUNT * 33 + lane] + reference; + out[INDEX(34, lane)] = in[LANE_COUNT * 34 + lane] + reference; + out[INDEX(35, lane)] = in[LANE_COUNT * 35 + lane] + reference; + out[INDEX(36, lane)] = in[LANE_COUNT * 36 + lane] + reference; + out[INDEX(37, lane)] = in[LANE_COUNT * 37 + lane] + reference; + out[INDEX(38, lane)] = in[LANE_COUNT * 38 + lane] + reference; + out[INDEX(39, lane)] = in[LANE_COUNT * 39 + lane] + reference; + out[INDEX(40, lane)] = in[LANE_COUNT * 40 + lane] + reference; + out[INDEX(41, lane)] = in[LANE_COUNT * 41 + lane] + reference; + out[INDEX(42, lane)] = in[LANE_COUNT * 42 + lane] + reference; + out[INDEX(43, lane)] = in[LANE_COUNT * 43 + lane] + reference; + out[INDEX(44, lane)] = in[LANE_COUNT * 44 + lane] + reference; + out[INDEX(45, lane)] = in[LANE_COUNT * 45 + lane] + reference; + out[INDEX(46, lane)] = in[LANE_COUNT * 46 + lane] + reference; + out[INDEX(47, lane)] = in[LANE_COUNT * 47 + lane] + reference; + out[INDEX(48, lane)] = in[LANE_COUNT * 48 + lane] + reference; + out[INDEX(49, lane)] = in[LANE_COUNT * 49 + lane] + reference; + out[INDEX(50, lane)] = in[LANE_COUNT * 50 + lane] + reference; + out[INDEX(51, lane)] = in[LANE_COUNT * 51 + lane] + reference; + out[INDEX(52, lane)] = in[LANE_COUNT * 52 + lane] + reference; + out[INDEX(53, lane)] = in[LANE_COUNT * 53 + lane] + reference; + out[INDEX(54, lane)] = in[LANE_COUNT * 54 + lane] + reference; + out[INDEX(55, lane)] = in[LANE_COUNT * 55 + lane] + reference; + out[INDEX(56, lane)] = in[LANE_COUNT * 56 + lane] + reference; + out[INDEX(57, lane)] = in[LANE_COUNT * 57 + lane] + reference; + out[INDEX(58, lane)] = in[LANE_COUNT * 58 + lane] + reference; + out[INDEX(59, lane)] = in[LANE_COUNT * 59 + lane] + reference; + out[INDEX(60, lane)] = in[LANE_COUNT * 60 + lane] + reference; + out[INDEX(61, lane)] = in[LANE_COUNT * 61 + lane] + reference; + out[INDEX(62, lane)] = in[LANE_COUNT * 62 + lane] + reference; + out[INDEX(63, lane)] = in[LANE_COUNT * 63 + lane] + reference; +} + +__device__ void _bit_unpack_64_64bw_16t(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx) { + _bit_unpack_64_64bw_lane(in, out, reference, thread_idx * 1 + 0); +} + +extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 64 / sizeof(uint64_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_64_64bw_16t(in, out, thread_idx); + _bit_unpack_64_64bw_16t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/kernels/src/bit_unpack_8.cu b/vortex-cuda/kernels/src/bit_unpack_8.cu index 362b22c006e..fc541c65bff 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8.cu +++ b/vortex-cuda/kernels/src/bit_unpack_8.cu @@ -4,366 +4,365 @@ #include #include "fastlanes_common.cuh" -__device__ void _bit_unpack_8_0bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_0bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; - uint8_t zero = 0ULL; - out[INDEX(0, lane)] = zero; - out[INDEX(1, lane)] = zero; - out[INDEX(2, lane)] = zero; - out[INDEX(3, lane)] = zero; - out[INDEX(4, lane)] = zero; - out[INDEX(5, lane)] = zero; - out[INDEX(6, lane)] = zero; - 
out[INDEX(7, lane)] = zero; + out[INDEX(0, lane)] = reference; + out[INDEX(1, lane)] = reference; + out[INDEX(2, lane)] = reference; + out[INDEX(3, lane)] = reference; + out[INDEX(4, lane)] = reference; + out[INDEX(5, lane)] = reference; + out[INDEX(6, lane)] = reference; + out[INDEX(7, lane)] = reference; } -__device__ void _bit_unpack_8_0bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_0bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_0bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_0bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_0bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_0bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_0bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_0bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_0bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_0bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_0bw_32t(in, out, thread_idx); + _bit_unpack_8_0bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_1bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_1bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 1); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 1); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 1); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 1); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 1); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 1); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 1); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_1bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_1bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_1bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_1bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_1bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_1bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_1bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_1bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_1bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_1bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict 
full_out) { +extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_1bw_32t(in, out, thread_idx); + _bit_unpack_8_1bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_2bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_2bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 2); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 2); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 2); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 0)) << 2; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 2); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 2); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 2); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_2bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_2bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_2bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_2bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_2bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_2bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_2bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_2bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_2bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_2bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_2bw_32t(in, out, thread_idx); + _bit_unpack_8_2bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_3bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_3bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 3); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 3); - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 1)) << 2; - out[INDEX(2, lane)] = tmp; + 
out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 3); - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 3); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 2)) << 1; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 3); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_3bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_3bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_3bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_3bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_3bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_3bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_3bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_3bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_3bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_3bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_3bw_32t(in, out, thread_idx); + _bit_unpack_8_3bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_4bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_4bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 0)) << 4; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 4); - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_4bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_4bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_4bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_4bw_lane(in, out, thread_idx * 4 + 2); - 
_bit_unpack_8_4bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_4bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_4bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_4bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_4bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_4bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_4bw_32t(in, out, thread_idx); + _bit_unpack_8_4bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_5bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_5bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 5); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 2)) << 3; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 5); - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 4)) << 1; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 1)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 5); - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 3)) << 2; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 5); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_5bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_5bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_5bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_5bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_5bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_5bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_5bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_5bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_5bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_5bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - 
_bit_unpack_8_5bw_32t(in, out, thread_idx); + _bit_unpack_8_5bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_6bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_6bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 6); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 4)) << 2; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 2]; tmp |= (src & MASK(uint8_t, 2)) << 4; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 0)) << 6; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 0) & MASK(uint8_t, 6); - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 4)) << 2; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint8_t, 2)) << 4; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_6bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_6bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_6bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_6bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_6bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_6bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_6bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_6bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_6bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_6bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_6bw_32t(in, out, thread_idx); + _bit_unpack_8_6bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_7bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_7bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; uint8_t src; uint8_t tmp; src = in[lane]; tmp = (src >> 0) & MASK(uint8_t, 7); - out[INDEX(0, lane)] = tmp; + out[INDEX(0, lane)] = tmp + reference; tmp = (src >> 7) & MASK(uint8_t, 1); src = in[lane + LANE_COUNT * 1]; tmp |= (src & MASK(uint8_t, 6)) << 1; - out[INDEX(1, lane)] = tmp; + out[INDEX(1, lane)] = tmp + reference; tmp = (src >> 6) & MASK(uint8_t, 2); src = in[lane + LANE_COUNT * 2]; 
tmp |= (src & MASK(uint8_t, 5)) << 2; - out[INDEX(2, lane)] = tmp; + out[INDEX(2, lane)] = tmp + reference; tmp = (src >> 5) & MASK(uint8_t, 3); src = in[lane + LANE_COUNT * 3]; tmp |= (src & MASK(uint8_t, 4)) << 3; - out[INDEX(3, lane)] = tmp; + out[INDEX(3, lane)] = tmp + reference; tmp = (src >> 4) & MASK(uint8_t, 4); src = in[lane + LANE_COUNT * 4]; tmp |= (src & MASK(uint8_t, 3)) << 4; - out[INDEX(4, lane)] = tmp; + out[INDEX(4, lane)] = tmp + reference; tmp = (src >> 3) & MASK(uint8_t, 5); src = in[lane + LANE_COUNT * 5]; tmp |= (src & MASK(uint8_t, 2)) << 5; - out[INDEX(5, lane)] = tmp; + out[INDEX(5, lane)] = tmp + reference; tmp = (src >> 2) & MASK(uint8_t, 6); src = in[lane + LANE_COUNT * 6]; tmp |= (src & MASK(uint8_t, 1)) << 6; - out[INDEX(6, lane)] = tmp; + out[INDEX(6, lane)] = tmp + reference; tmp = (src >> 1) & MASK(uint8_t, 7); - out[INDEX(7, lane)] = tmp; + out[INDEX(7, lane)] = tmp + reference; } -__device__ void _bit_unpack_8_7bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_7bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_7bw_lane(in, out, thread_idx * 4 + 1); - _bit_unpack_8_7bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_7bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_7bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_7bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_7bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_7bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_7bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_7bw_32t(in, out, thread_idx); + _bit_unpack_8_7bw_32t(in, out, reference, thread_idx); } -__device__ void _bit_unpack_8_8bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, unsigned int lane) { +__device__ void _bit_unpack_8_8bw_lane(const uint8_t *__restrict in, uint8_t *__restrict out, const uint8_t reference, unsigned int lane) { unsigned int LANE_COUNT = 128; - out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane]; - out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane]; - out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane]; - out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane]; - out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane]; - out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane]; - out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane]; - out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane]; + out[INDEX(0, lane)] = in[LANE_COUNT * 0 + lane] + reference; + out[INDEX(1, lane)] = in[LANE_COUNT * 1 + lane] + reference; + out[INDEX(2, lane)] = in[LANE_COUNT * 2 + lane] + reference; + out[INDEX(3, lane)] = in[LANE_COUNT * 3 + lane] + reference; + out[INDEX(4, lane)] = in[LANE_COUNT * 4 + lane] + reference; + out[INDEX(5, lane)] = in[LANE_COUNT * 5 + lane] + reference; + out[INDEX(6, lane)] = in[LANE_COUNT * 6 + lane] + reference; + out[INDEX(7, lane)] = in[LANE_COUNT * 7 + lane] + reference; } -__device__ void _bit_unpack_8_8bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, int thread_idx) { - _bit_unpack_8_8bw_lane(in, out, thread_idx * 4 + 0); - _bit_unpack_8_8bw_lane(in, out, 
thread_idx * 4 + 1); - _bit_unpack_8_8bw_lane(in, out, thread_idx * 4 + 2); - _bit_unpack_8_8bw_lane(in, out, thread_idx * 4 + 3); +__device__ void _bit_unpack_8_8bw_32t(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx) { + _bit_unpack_8_8bw_lane(in, out, reference, thread_idx * 4 + 0); + _bit_unpack_8_8bw_lane(in, out, reference, thread_idx * 4 + 1); + _bit_unpack_8_8bw_lane(in, out, reference, thread_idx * 4 + 2); + _bit_unpack_8_8bw_lane(in, out, reference, thread_idx * 4 + 3); } -extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out) { +extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference) { int thread_idx = threadIdx.x; auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint8_t))); auto out = full_out + (blockIdx.x * 1024); - _bit_unpack_8_8bw_32t(in, out, thread_idx); + _bit_unpack_8_8bw_32t(in, out, reference, thread_idx); } diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 5c19f9a5e17..ad2a762b1ca 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -55,13 +55,15 @@ impl CudaExecute for BitPackedExecutor { Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected BitPackedArray"))?; match_each_integer_ptype!(array.ptype(), |A| { - decode_bitpacked::(array, ctx).await + // We use a reference value of 0 since this is standard unpacking and not FFOR. + decode_bitpacked::(array, 0, ctx).await }) } } -async fn decode_bitpacked( +pub(crate) async fn decode_bitpacked( array: BitPackedArray, + reference: A, ctx: &mut CudaExecutionCtx, ) -> VortexResult where @@ -107,6 +109,7 @@ where // Build launch args: input, output, f, e, length launch_builder.arg(&input_view); launch_builder.arg(&output_view); + launch_builder.arg(&reference); let num_blocks = u32::try_from(len.div_ceil(1024))?; @@ -117,8 +120,10 @@ where }; // Launch kernel - let _cuda_events = + let cuda_events = launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_DISABLE_TIMING)?; + let duration = cuda_events.duration()?; + tracing::trace!(execution = ?duration, "FFOR kernel execution"); } let output_handle = match patches { diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs index 4a585f52f6e..e961cf6db9b 100644 --- a/vortex-cuda/src/kernel/encodings/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -7,24 +7,31 @@ use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; +use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; use vortex_array::arrays::PrimitiveArrayParts; +use vortex_array::arrays::SliceVTable; use vortex_array::buffer::BufferHandle; use vortex_cuda_macros::cuda_tests; use vortex_dtype::NativePType; +use vortex_dtype::match_each_integer_ptype; use vortex_dtype::match_each_native_simd_ptype; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; +use vortex_fastlanes::BitPackedArray; +use vortex_fastlanes::BitPackedVTable; use vortex_fastlanes::FoRArray; use vortex_fastlanes::FoRVTable; +use vortex_fastlanes::unpack_iter::BitPacked; use crate::CudaBufferExt; use crate::executor::CudaArrayExt; use crate::executor::CudaExecute; use 
crate::executor::CudaExecutionCtx; +use crate::kernel::encodings::bitpacked::decode_bitpacked; use crate::launch_cuda_kernel_impl; /// CUDA decoder for frame-of-reference. @@ -46,6 +53,35 @@ impl CudaExecute for FoRExecutor { ) -> VortexResult<Canonical> { let array = Self::try_specialize(array).ok_or_else(|| vortex_err!("Expected FoRArray"))?; + // Fused kernel: FFOR + if let Some(bitpacked) = array.encoded().as_opt::<BitPackedVTable>() { + // If BP was chosen then we know we have integers. + match_each_integer_ptype!(array.ptype(), |P| { + // Doesn't this only work for unsigned ints? + let reference: P = P::try_from(array.reference_scalar())?; + return decode_ffor(bitpacked.clone(), reference, ctx) + .await? + .into_primitive() + .to_canonical(); + }); + } + + // Fused kernel: sliced FFOR + if let Some(sliced) = array.encoded().as_opt::<SliceVTable>() + && let Some(bitpacked) = sliced.child().as_opt::<BitPackedVTable>() + { + // Slice first. Defer slicing of patches until after. + // If BP was chosen then we know we have integers. + match_each_integer_ptype!(array.ptype(), |P| { + let reference: P = P::try_from(array.reference_scalar())?; + return decode_ffor(bitpacked.clone(), reference, ctx) + .await? + .into_primitive() + .slice(sliced.slice_range().clone())? + .to_canonical(); + }); + } + match_each_native_simd_ptype!(array.ptype(), |P| { decode_for::<P>(array, ctx).await }) } }
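// A minimal sketch of how the fused path above is reached from caller code, assuming the imports
// already present in this file; the `Canonical` return type and the exact `execute_cuda` signature
// are inferred from this diff rather than guaranteed by the crate.
async fn decode_on_gpu(array: ArrayRef, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical> {
    // For FoR-over-BitPacked (and sliced FoR-over-BitPacked) arrays, dispatch reaches
    // `FoRExecutor::execute` above, which launches one fused unpack-plus-reference kernel
    // instead of a bit-unpack kernel followed by a separate FoR addition over the output.
    array.execute_cuda(ctx).await
}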
@@ -91,8 +127,11 @@ where launch_builder.arg(&array_len_u64); // Launch kernel - let _cuda_events = + let cuda_events = launch_cuda_kernel_impl(&mut launch_builder, CU_EVENT_DISABLE_TIMING, array_len)?; + let duration = cuda_events.duration()?; + + tracing::trace!(execution = ?duration, "FOR kernel execution"); // Build result - in-place reuses the same buffer Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle( @@ -102,6 +141,20 @@ where ))) } +/// Decode a fused FOR + BP operation. +async fn decode_ffor<T>( + bitpacked: BitPackedArray, + reference: T, + ctx: &mut CudaExecutionCtx, +) -> VortexResult +where + T: NativePType + DeviceRepr + BitPacked, + T::Physical: DeviceRepr + Send + Sync + 'static, +{ + // The reference is added inside the unpack kernel, so no separate FoR pass is needed. + decode_bitpacked(bitpacked, reference, ctx).await +} + #[cuda_tests] mod tests { use rstest::rstest; diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 33233582116..14e42078f38 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -16,6 +16,7 @@ use crate::ExportDeviceArray; use crate::arrow::CanonicalDeviceArrayExport; use crate::executor::CudaExecute; pub use crate::executor::CudaExecutionCtx; +use crate::initialize_cuda; use crate::kernel::KernelLoader; use crate::stream::VortexCudaStream; use crate::stream_pool::VortexCudaStreamPool; @@ -128,7 +129,7 @@ impl CudaSession { } impl Default for CudaSession { - /// Creates a default CUDA session using device 0. + /// Creates a default CUDA session using device 0, with all GPU array kernels preloaded. /// /// # Panics /// @@ -136,7 +137,9 @@ impl Default for CudaSession { fn default() -> Self { #[expect(clippy::expect_used)] let context = CudaContext::new(0).expect("Failed to initialize CUDA device 0"); - Self::new(context) + let this = Self::new(context); + initialize_cuda(&this); + this } } diff --git a/vortex-cuda/src/stream.rs b/vortex-cuda/src/stream.rs index fad54b36ab4..29f05a890a0 100644 --- a/vortex-cuda/src/stream.rs +++ b/vortex-cuda/src/stream.rs @@ -12,6 +12,7 @@ use cudarc::driver::DevicePtrMut; use cudarc::driver::DeviceRepr; use cudarc::driver::result::memcpy_htod_async; use cudarc::driver::result::stream; +use futures::SinkExt; use futures::future::BoxFuture; use kanal::Sender; use vortex_array::buffer::BufferHandle; @@ -155,10 +156,11 @@ fn register_stream_callback(stream: &CudaStream) -> VortexResult) }; // Blocking send as we're in a callback invoked by the CUDA driver. - #[expect(clippy::expect_used)] - tx.send(()) - // A send should never fail. Panic otherwise. - .expect("CUDA callback receiver dropped unexpectedly"); + // NOTE: send can fail if the CudaEvent is dropped by the caller, in which case the receiver + is closed and sends will fail.
+ if let Err(e) = tx.send(()) { + tracing::warn!(error = ?e, "register_stream_callback send failed due to error"); + } } // SAFETY: diff --git a/vortex-python/src/arrow.rs b/vortex-python/src/arrow.rs index ece662ee80e..861602f08c9 100644 --- a/vortex-python/src/arrow.rs +++ b/vortex-python/src/arrow.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2016-2025 Copyright The Apache Software Foundation // SPDX-FileCopyrightText: 2025 Copyright the Vortex contributors // SPDX-License-Identifier: Apache-2.0 -// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/lib.rs at commit 549709fb at https://github.com/apache/arrow-rs +// SPDX-FileComment: Derived from upstream file arrow-pyarrow/src/main at commit 549709fb at https://github.com/apache/arrow-rs // SPDX-FileNotice: https://github.com/apache/arrow-rs/blob/549709fbdf91cd1f6c263a7e4540c542b6fecf6b/NOTICE.txt #![allow(clippy::same_name_method)] diff --git a/vortex-test/e2e-cuda-scan/Cargo.toml b/vortex-test/e2e-cuda-scan/Cargo.toml new file mode 100644 index 00000000000..fb377c2b808 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "vortex-test-e2e-cuda-scan" +authors = { workspace = true } +description = "CUDA scan testing" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +publish = false +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +arrow-array = { workspace = true, features = ["ffi"] } +arrow-schema = { workspace = true, features = ["ffi"] } +futures = { workspace = true, features = ["executor"] } +tokio = { workspace = true, features = ["macros", "full"]} +tracing-subscriber = {workspace = true, features = ["env-filter"]} +vortex = { workspace = true } +vortex-cuda = { workspace = true, features = ["_test-harness"] } diff --git a/vortex-test/e2e-cuda-scan/src/main.rs b/vortex-test/e2e-cuda-scan/src/main.rs new file mode 100644 index 00000000000..edb56274fa3 --- /dev/null +++ b/vortex-test/e2e-cuda-scan/src/main.rs @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::env::args; +use std::path::Path; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use futures::StreamExt; +use tracing_subscriber::EnvFilter; +use vortex::VortexSessionDefault; +use vortex::array::Array; +use vortex::array::ToCanonical; +use vortex::buffer::ByteBuffer; +use vortex::buffer::ByteBufferMut; +use vortex::compressor::BtrBlocksCompressorBuilder; +use vortex::compressor::FloatCode; +use vortex::compressor::IntCode; +use vortex::compressor::StringCode; +use vortex::error::VortexResult; +use vortex::file::Footer; +use vortex::file::OpenOptionsSessionExt; +use vortex::file::WriteOptionsSessionExt; +use vortex::file::WriteStrategyBuilder; +use vortex::session::VortexSession; +use vortex_cuda::CopyDeviceReadAt; +use vortex_cuda::CudaSession; +use vortex_cuda::CudaSessionExt; +use vortex_cuda::VortexCudaStreamPool; +use vortex_cuda::executor::CudaArrayExt; + +#[tokio::main] +pub async fn main() -> VortexResult<()> { + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env()) + .init(); + + let session = VortexSession::default(); + vortex_cuda::initialize_cuda(&session.cuda_session()); + let mut cuda_ctx = CudaSession::create_execution_ctx(&session)?; + + let input_path = 
PathBuf::from(args().nth(1).expect("must provide path to .vortex file")); + + assert!(input_path.exists(), "input path does not exist"); + + let (recompressed, footer) = recompress_for_gpu(input_path, &session).await?; + + // Create a full scan that executes on the GPU + let cuda_stream = + VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?; + let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream); + + let gpu_file = session + .open_options() + .with_footer(footer) + .open(Arc::new(gpu_reader)) + .await?; + + // execute_micros => µs to execute + let mut batches = gpu_file.scan()?.into_array_stream()?; + + println!("column|chunk|row_count|encoding_tree|execution_micros"); + + let mut chunk = 0; + while let Some(next) = batches.next().await.transpose()? { + let record = next.to_struct(); + + for (field, field_name) in record + .unmasked_fields() + .iter() + .zip(record.struct_fields().names().iter()) + { + let len = field.len(); + + let start = Instant::now(); + // execute_cuda several times to get meaningful measurements. + // NOTE: is there any internal caching here? + for _ in 0..10 { + field.clone().execute_cuda(&mut cuda_ctx).await?; + } + let execute_micros = start.elapsed().as_micros(); + + let encoding = field + .display_tree_encodings_only() + .to_string() + .escape_default() + .to_string(); + + println!("{field_name}|{chunk}|{len}|{encoding}|{execute_micros}"); + } + + chunk += 1; + } + + Ok(()) +} + +// Dump the values out as a new Vortex file for analysis. + +/// Recompress the input file using only GPU-executable encodings, returning the file as an +/// in-memory byte array. +async fn recompress_for_gpu( + input_path: impl AsRef, + session: &VortexSession, +) -> VortexResult<(ByteBuffer, Footer)> { + // Setup the reader + let input = session.open_options().open_path(input_path).await?; + + // Build a scan to read all columns from the input, and recompress them using only GPU-compatible + // encodings. + let scan = input.scan()?.into_array_stream()?; + + // Rebuild a copy of the file that only uses GPU-compatible compression algorithms. + let compressor = BtrBlocksCompressorBuilder::empty() + .include_int([ + IntCode::Uncompressed, + IntCode::Constant, + IntCode::BitPacking, + IntCode::For, + IntCode::Sequence, + IntCode::ZigZag, + IntCode::Dict, + ]) + .include_float([ + FloatCode::Uncompressed, + FloatCode::Constant, + FloatCode::Alp, + FloatCode::AlpRd, + FloatCode::RunEnd, + ]) + // Don't compress strings, this is b/c we don't have any BtrBlocks encodings that support + // strings. + .include_string([StringCode::Uncompressed]) + .build(); + + // Read an input stream from a Vortex file. + let writer = WriteStrategyBuilder::default() + .with_compressor(compressor) + .build(); + + // Segment sink? + let mut out = ByteBufferMut::empty(); + let result = session + .write_options() + .with_strategy(writer) + .write(&mut out, scan) + .await?; + + Ok((out.freeze(), result.footer().clone())) +} diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index 0da4a1a1b40..c989d634cb2 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -32,6 +32,10 @@ pub mod compute2 { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; + pub use vortex_btrblocks::BtrBlocksCompressorBuilder; + pub use vortex_btrblocks::FloatCode; + pub use vortex_btrblocks::IntCode; + pub use vortex_btrblocks::StringCode; } pub mod dtype {