From 3fca7ce2610ddda0905529a71275792fe9424edd Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Tue, 27 Jan 2026 17:51:46 -0700 Subject: [PATCH] polyval: implement powers-of-H for `soft` backend Loops over the input blocks, performing `karatsuba` using powers-of-H and accumulating a wide product (in normal and bit-reversed form), then performing a final `mont_reduce`. This avoids performing a `mont_reduce` on each block (although to be fair, it just performs shifts/XORs and is not nearly as expensive as the multiplications in `karatsuba`). It could perhaps be improved by splitting `karatsuba` into `karatsuba1`/`karatsuba2` like the other backends and skipping the recombination/product assembly steps. This uses `1` as `FieldElement::DEFAULT_PARALLELISM` (which it seems was mistakenly set to `8` before) so this functionality is not on by default. It seems like it will probably not be much of a win without additional work. Even if it isn't, though, all backends now have the same structure and `soft` is not a weird special case when used with `N > 1`. Also adds a proptest which checks that whatever parallel backend is in use produces equivalent results to a pure Rust serial implementation, using the `FieldElement` type's public API (namely `Add` and `Mul`) via the newly added `hazmat` feature. 
--- Cargo.lock | 311 ++++++++++++++++++++++++ polyval/Cargo.toml | 3 + polyval/src/field_element.rs | 10 +- polyval/src/field_element/autodetect.rs | 22 +- polyval/src/field_element/common.rs | 22 ++ polyval/src/field_element/soft.rs | 30 +-- polyval/src/lib.rs | 27 ++ polyval/tests/{lib.rs => long_input.rs} | 21 +- polyval/tests/proptests.rs | 34 +++ 9 files changed, 422 insertions(+), 58 deletions(-) create mode 100644 polyval/src/field_element/common.rs rename polyval/tests/{lib.rs => long_input.rs} (65%) create mode 100644 polyval/tests/proptests.rs diff --git a/Cargo.lock b/Cargo.lock index fbe5f71..e24a4f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,39 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "cpubits" version = "0.1.0-rc.1" @@ -26,6 +59,40 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "ghash" version = "0.6.0-rc.3" @@ -56,6 +123,27 @@ version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + [[package]] name = "poly1305" version = "0.9.0-rc.3" @@ -73,22 +161,186 @@ dependencies = [ "cpubits", "cpufeatures", "hex-literal", + "proptest", "universal-hash", "zeroize", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee689443a2bd0a16ab0348b52ee43e3b2d1b1f931c8aa5c9f8de4c86fbe8c40" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rustix" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "subtle" version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "2.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "typenum" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicode-ident" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + [[package]] name = "universal-hash" version = "0.6.0-rc.8" @@ -99,6 +351,65 @@ dependencies = [ "subtle", ] +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + +[[package]] +name = "zerocopy" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71ddd76bcebeed25db614f82bf31a9f4222d3fbba300e6fb6c00afa26cbd4d9d" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.34" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8187381b52e32220d50b255276aa16a084ec0a9017a0ca2152a1f55c539758d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zeroize" version = "1.8.2" diff --git a/polyval/Cargo.toml b/polyval/Cargo.toml index d34d6b7..7911135 100644 --- a/polyval/Cargo.toml +++ b/polyval/Cargo.toml @@ -29,6 +29,9 @@ cpufeatures = "0.2" [dev-dependencies] hex-literal = "1" +[target.'cfg(any(unix, windows))'.dev-dependencies] +proptest = "1.9" + [lints.rust] missing_copy_implementations = "warn" missing_debug_implementations = "warn" diff --git a/polyval/src/field_element.rs b/polyval/src/field_element.rs index aa5358b..d67a1d9 100644 --- a/polyval/src/field_element.rs +++ b/polyval/src/field_element.rs @@ -1,5 +1,6 @@ //! POLYVAL field element implementation. +mod common; mod soft; use crate::{BLOCK_SIZE, Block}; @@ -66,19 +67,16 @@ cfg_if! { impl FieldElement { /// Default degree of parallelism, i.e. how many powers of `H` to compute. - pub const DEFAULT_PARALLELISM: usize = 8; + pub const DEFAULT_PARALLELISM: usize = 1; - /// Stub implementation that works with `Polyval::h` even though we don't support - /// `proc_par_blocks`. #[inline] pub(crate) fn powers_of_h( self, _has_intrinsics: InitToken ) -> [Self; N] { - soft::powers_of_h(self) + common::powers_of_h(self, Mul::mul) } - /// Process an individual block. pub(crate) fn proc_block( h: FieldElement, y: FieldElement, @@ -88,8 +86,6 @@ cfg_if! { soft::proc_block(h, y, x) } - /// Process multiple blocks in parallel. 
- // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only pub(crate) fn proc_par_blocks( powers_of_h: &[FieldElement; N], y: FieldElement, diff --git a/polyval/src/field_element/autodetect.rs b/polyval/src/field_element/autodetect.rs index ebb36c5..ea99e09 100644 --- a/polyval/src/field_element/autodetect.rs +++ b/polyval/src/field_element/autodetect.rs @@ -6,8 +6,9 @@ use super::armv8 as intrinsics; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use super::x86 as intrinsics; -use super::{FieldElement, soft}; +use super::{FieldElement, common, soft}; use crate::Block; +use core::ops::Mul; use universal_hash::array::{Array, ArraySize}; #[cfg(target_arch = "aarch64")] @@ -25,20 +26,12 @@ impl FieldElement { #[inline] pub(crate) fn powers_of_h(self, has_intrinsics: InitToken) -> [Self; N] { if has_intrinsics.get() { - // TODO: improve pipelining by using more square operations? - let mut pow = [Self::default(); N]; - let mut prev = self; - - for (i, v) in pow.iter_mut().rev().enumerate() { - *v = self; - if i > 0 { - *v = unsafe { intrinsics::polymul((*v).into(), prev.into()) }.into(); - } - prev = *v; - } - pow + // SAFETY: we only need to ensure we have intrinsics, which we just did + common::powers_of_h(self, |a, b| unsafe { + intrinsics::polymul(a.into(), b.into()).into() + }) } else { - soft::powers_of_h(self) + common::powers_of_h(self, Mul::mul) } } @@ -68,7 +61,6 @@ impl FieldElement { // SAFETY: we have checked the CPU has the necessary intrinsics above unsafe { intrinsics::proc_par_blocks(powers_of_h, y, blocks) } } else { - // TODO(tarcieri): currently just calls `proc_block` for each block on `soft`-only soft::proc_par_blocks(powers_of_h, y, blocks) } } diff --git a/polyval/src/field_element/common.rs b/polyval/src/field_element/common.rs new file mode 100644 index 0000000..0313840 --- /dev/null +++ b/polyval/src/field_element/common.rs @@ -0,0 +1,22 @@ +/// Compute the first `N` powers of `h`, in reverse order. 
+/// +/// Implemented generically so it can be shared by software and SIMD implementations. +#[inline] +pub(super) fn powers_of_h(h: T, mul: Mul) -> [T; N] +where + T: Copy, + Mul: Fn(T, T) -> T, +{ + let mut pow = [h; N]; + + // TODO: improve pipelining by using more square operations? + let mut prev = h; + for (i, v) in pow.iter_mut().rev().enumerate() { + *v = h; + if i > 0 { + *v = mul(*v, prev); + } + prev = *v; + } + pow +} diff --git a/polyval/src/field_element/soft.rs b/polyval/src/field_element/soft.rs index c49a937..ddf5880 100644 --- a/polyval/src/field_element/soft.rs +++ b/polyval/src/field_element/soft.rs @@ -37,15 +37,6 @@ use core::{ use soft_impl::{karatsuba, mont_reduce}; use universal_hash::array::{Array, ArraySize}; -/// Stub implementation which only makes `PolyvalGeneric::h` work. -// TODO(tarcieri): actually implement this optimization? -#[inline] -pub(super) fn powers_of_h(h: FieldElement) -> [FieldElement; N] { - let mut ret = [FieldElement::default(); N]; - ret[N - 1] = h; - ret -} - /// Perform carryless multiplication of `y` by `h` and return the result. #[inline] pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement { @@ -54,25 +45,30 @@ pub(super) fn polymul(y: FieldElement, h: FieldElement) -> FieldElement { } /// Process an individual block. -// TODO(tarcieri): implement `proc_par_blocks` for soft backend? #[inline] pub(super) fn proc_block(h: FieldElement, y: FieldElement, x: &Block) -> FieldElement { - let x = FieldElement::from(x); - polymul(y + x, h) + polymul(y + x.into(), h) } /// Process multiple blocks. -// TODO(tarcieri): optimized implementation? 
#[inline] pub(super) fn proc_par_blocks( powers_of_h: &[FieldElement; N], - mut y: FieldElement, + y: FieldElement, blocks: &Array, ) -> FieldElement { - for block in blocks.iter() { - y = proc_block(powers_of_h[N - 1], y, block); + // First block + let mut v = karatsuba(y + blocks[0].into(), powers_of_h[0]); + + // Remaining blocks + for i in 1..blocks.len() { + let v2 = karatsuba(blocks[i].into(), powers_of_h[i]); + for (a, b) in v.iter_mut().zip(v2.iter()) { + *a ^= b; + } } - y + + mont_reduce(v) } /// Multiplication in GF(2)[X], implemented generically and wrapped as `bmul32` and `bmul64`. diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs index 4116d10..e6df48d 100644 --- a/polyval/src/lib.rs +++ b/polyval/src/lib.rs @@ -165,3 +165,30 @@ impl Debug for PolyvalGeneric { write!(f, "PolyvalGeneric<{}> {{ ... }}", N) } } + +#[cfg(test)] +mod tests { + use crate::{BLOCK_SIZE, Polyval, universal_hash::UniversalHash}; + use hex_literal::hex; + + // + // Test vectors for POLYVAL from RFC 8452 Appendix A + // + // + + const H: [u8; BLOCK_SIZE] = hex!("25629347589242761d31f826ba4b757b"); + const X_1: [u8; BLOCK_SIZE] = hex!("4f4f95668c83dfb6401762bb2d01a262"); + const X_2: [u8; BLOCK_SIZE] = hex!("d1a24ddd2721d006bbe45f20d3c9f362"); + + /// POLYVAL(H, X_1, X_2) + const POLYVAL_RESULT: [u8; BLOCK_SIZE] = hex!("f7a3b47b846119fae5b7866cf5e5b77e"); + + #[test] + fn polyval_test_vector() { + let mut poly = Polyval::new(&H.into()); + poly.update(&[X_1.into(), X_2.into()]); + + let result = poly.finalize(); + assert_eq!(&POLYVAL_RESULT[..], result.as_slice()); + } +} diff --git a/polyval/tests/lib.rs b/polyval/tests/long_input.rs similarity index 65% rename from polyval/tests/lib.rs rename to polyval/tests/long_input.rs index 9190ea4..27bb4e3 100644 --- a/polyval/tests/lib.rs +++ b/polyval/tests/long_input.rs @@ -1,8 +1,8 @@ -//! POLYVAL integration tests. +//! Longer test cases to ensure that long-input optimizations behave correctly. 
use hex_literal::hex; use polyval::{ - BLOCK_SIZE, Polyval, PolyvalGeneric, + BLOCK_SIZE, PolyvalGeneric, universal_hash::{KeyInit, Reset, UniversalHash, common::KeySizeUser, typenum::U16}, }; @@ -12,23 +12,6 @@ use polyval::{ // const H: [u8; BLOCK_SIZE] = hex!("25629347589242761d31f826ba4b757b"); -const X_1: [u8; BLOCK_SIZE] = hex!("4f4f95668c83dfb6401762bb2d01a262"); -const X_2: [u8; BLOCK_SIZE] = hex!("d1a24ddd2721d006bbe45f20d3c9f362"); - -/// POLYVAL(H, X_1, X_2) -const POLYVAL_RESULT: [u8; BLOCK_SIZE] = hex!("f7a3b47b846119fae5b7866cf5e5b77e"); - -#[test] -fn polyval_test_vector() { - let mut poly = Polyval::new(&H.into()); - poly.update(&[X_1.into(), X_2.into()]); - - let result = poly.finalize(); - assert_eq!(&POLYVAL_RESULT[..], result.as_slice()); -} - -// A longer test case, to ensure that long-input optimizations -// behave correctly. fn longer_test() where diff --git a/polyval/tests/proptests.rs b/polyval/tests/proptests.rs new file mode 100644 index 0000000..131327e --- /dev/null +++ b/polyval/tests/proptests.rs @@ -0,0 +1,34 @@ +//! Property-based tests. + +#![cfg(all(any(unix, windows), feature = "hazmat"))] + +use polyval::{BLOCK_SIZE, Block, KEY_SIZE, hazmat::FieldElement, universal_hash::UniversalHash}; +use proptest::prelude::*; + +/// Number of blocks to compute in parallel +const PARALLEL_BLOCKS: usize = 8; + +/// Ensure we're always parallel +type ParallelPolyval = polyval::PolyvalGeneric; + +proptest! { + /// Test explicitly parallel implementation for equivalence to the `soft` backend (which is what + /// powers `Add`/`Mul` trait impls on `FieldElement`). 
+ #[test] + fn par_soft_equivalence( + key in any::<[u8; KEY_SIZE]>(), + data in any::<[u8; BLOCK_SIZE * PARALLEL_BLOCKS]>() + ) { + let mut polyval = ParallelPolyval::new(&key.into()); + polyval.update_padded(&data); + let actual = polyval.finalize(); + + let h = FieldElement::from(key); + let mut y = FieldElement::default(); + for block in Block::slice_as_chunks(&data).0 { + y = (y + block.into()) * h; + } + + prop_assert_eq!(actual, Block::from(y)); + } +}