diff --git a/Cargo.lock b/Cargo.lock index 88f1028d8..48206bddc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -810,6 +810,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" name = "eip-152" version = "0.1.0" dependencies = [ + "arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] diff --git a/util/EIP-152/Cargo.toml b/util/EIP-152/Cargo.toml index fe65d0110..fb474919f 100644 --- a/util/EIP-152/Cargo.toml +++ b/util/EIP-152/Cargo.toml @@ -12,3 +12,11 @@ edition = "2018" [dependencies] rustc-hex = "2.0.1" +arrayref = "0.3.5" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench" +harness = false diff --git a/util/EIP-152/LICENSE b/util/EIP-152/LICENSE new file mode 100644 index 000000000..f5ab386d4 --- /dev/null +++ b/util/EIP-152/LICENSE @@ -0,0 +1,25 @@ +This program is copyright 2019 Parity Technologies Limited and its licensors. + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + +Some portions of the program (“the Software”) are Copyright (c) 2018 Jack O'Connor +and the following relates solely to such portions: + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/util/EIP-152/benches/bench.rs b/util/EIP-152/benches/bench.rs new file mode 100644 index 000000000..8f6278af8 --- /dev/null +++ b/util/EIP-152/benches/bench.rs @@ -0,0 +1,191 @@ +// Copyright 2015-2019 Parity Technologies (UK) Ltd. +// This file is part of Parity Ethereum. + +// Parity Ethereum is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Parity Ethereum is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
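A note on the manifest changes above: `harness = false` turns off the default libtest bench harness so that criterion can supply its own `main` through its macros. As a minimal sketch of the shape such a bench target takes (the names here are illustrative, not part of this PR):

```rust
use criterion::{criterion_group, criterion_main, Criterion};

// A trivial benchmark function; criterion hands us a `Criterion` handle
// on which individual benchmarks are registered.
fn noop_benchmark(c: &mut Criterion) {
    c.bench_function("noop", |b| b.iter(|| ()));
}

// With `harness = false`, these macros generate the `main` entry point.
criterion_group!(benches, noop_benchmark);
criterion_main!(benches);
```

The `benches/bench.rs` added below follows exactly this shape, with the `compress` kernels in place of the no-op.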
+ + +use criterion::{Criterion, criterion_group, criterion_main, black_box, Throughput, BenchmarkId}; +use std::mem; +use std::sync::atomic::{AtomicPtr, Ordering}; +use eip_152::portable; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use eip_152::avx2; + +type FnRaw = *mut (); +type Blake2bF = fn(&mut [u64; 8], [u64; 16], [u64; 2], bool, usize); + +// `FN` starts out pointing at `detect`: the first call through it probes CPU features once, caches the chosen implementation back into `FN`, and forwards the call ("ifunc"-style dispatch). +static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); + +fn detect(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) { + let fun = if is_x86_feature_detected!("avx2") { + avx2::compress as FnRaw + } else { + portable::compress as FnRaw + }; + FN.store(fun as FnRaw, Ordering::Relaxed); + unsafe { + mem::transmute::<FnRaw, Blake2bF>(fun)(state, message, count, f, rounds) + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx_ifunc_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("avx2_ifunc"); + + for rounds in [12, 50, 100].iter() { + group.throughput(Throughput::Elements(*rounds as u64)); + group.bench_with_input( + BenchmarkId::new("rounds", rounds), + &rounds, + |b, rounds| { + let mut state = [ + 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64, + 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64, + 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64, + 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64, + ]; + + let message = [ + 0x0000000000636261_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + ]; + let count = [3, 0]; + let f = true; + + b.iter(|| { + unsafe { + let fun = FN.load(Ordering::Relaxed); + mem::transmute::<FnRaw, Blake2bF> + (fun) + ( + black_box(&mut state), + black_box(message), + black_box(count), + black_box(f), + black_box(**rounds as usize), + ); + } + }); + }, + ); + } + + group.finish(); +} + + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("avx2"); + + for rounds in [12, 50, 100].iter() { + group.throughput(Throughput::Elements(*rounds as u64)); + group.bench_with_input( + BenchmarkId::new("rounds", rounds), + &rounds, + |b, rounds| { + let mut state = [ + 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64, + 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64, + 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64, + 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64, + ]; + + let message = [ + 0x0000000000636261_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + ]; + let count = [3, 0]; + let f = true; + + b.iter(|| { + + unsafe { + avx2::compress( + black_box(&mut state), + black_box(message), + black_box(count), + black_box(f), + black_box(**rounds as usize), + ); + } + }); + }, + ); + } + + group.finish(); +} + + +pub fn portable_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("portable_impl"); + + for rounds in [12, 50, 100].iter() { + group.throughput(Throughput::Elements(*rounds as u64)); + group.bench_with_input( +
BenchmarkId::new("rounds", rounds), + &rounds, + |b, rounds| { + let mut state = [ + 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64, + 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64, + 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64, + 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64, + ]; + + let message = [ + 0x0000000000636261_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + 0x0000000000000000_u64, 0x0000000000000000_u64, + ]; + let count = [3, 0]; + let f = true; + + b.iter(|| { + portable::compress( + black_box(&mut state), + black_box(message), + black_box(count), + black_box(f), + black_box(**rounds as usize), + ); + }); + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, avx_benchmark, avx_ifunc_benchmark, portable_benchmark); +criterion_main!(benches); \ No newline at end of file diff --git a/util/EIP-152/src/avx2.rs b/util/EIP-152/src/avx2.rs new file mode 100644 index 000000000..a619698bd --- /dev/null +++ b/util/EIP-152/src/avx2.rs @@ -0,0 +1,471 @@ +// Copyright 2015-2019 Parity Technologies (UK) Ltd. +// This file is part of Parity Ethereum. + +// Parity Ethereum is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Parity Ethereum is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>. + +//! AVX2 implementation of the blake2b compression function. +use crate::IV; + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; +use arrayref::{array_refs, mut_array_refs}; + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2 +/// Takes as an argument the state vector `state`, message block vector `message`, offset counter `count`, final +/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first +/// parameter is modified by the function. +/// +/// `g1` only operates on `x` from the original g function. +/// ``` +/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) { +/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(x); +/// v[d] = (v[d] ^ v[a]).rotate_right(32); +/// v[c] = v[c].wrapping_add(v[d]); +/// v[b] = (v[b] ^ v[c]).rotate_right(24); +/// } +/// ``` +/// +/// `g2` only operates on `y` from the original g function.
+/// ``` +/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) { +/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(y); +/// v[d] = (v[d] ^ v[a]).rotate_right(16); +/// v[c] = v[c].wrapping_add(v[d]); +/// v[b] = (v[b] ^ v[c]).rotate_right(63); +/// } +/// ``` +/// +/// Together, one `g1` followed by one `g2` applies the full scalar g function to four columns (or, between `diagonalize` and `undiagonalize`, four diagonals) of the working matrix at once. +/// +/// Message mixing is done based on the `SIGMA` values for a given round. +/// +/// # Example +/// +/// `SIGMA` for round 1, i.e. `SIGMA[0]`, is `[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`; +/// ```text +/// let s = &SIGMA[0 % 10]; +/// // a, b, c, d, x +/// g(&mut v, 0, 4, 8, 12, m[s[0]]); +/// g(&mut v, 1, 5, 9, 13, m[s[2]]); +/// g(&mut v, 2, 6, 10, 14, m[s[4]]); +/// g(&mut v, 3, 7, 11, 15, m[s[6]]); +/// +/// let a = v[..4]; +/// let b = v[4..8]; +/// let c = v[8..12]; +/// let d = v[12..16]; +/// let mut b0 = [m[0], m[2], m[4], m[6]]; +/// +/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0); +/// // ... then construct b0 for `g2` etc. +/// ``` +/// +#[target_feature(enable = "avx2")] +pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) { + // get a mutable reference to state[0..4], state[4..] + let (state_low, state_high) = mut_array_refs!(state, 4, 4); + // get a reference to IV[0..4], IV[4..] + let (iv_low, iv_high) = array_refs!(&IV, 4, 4); + + // load them into __m256i registers + let mut a = loadu(state_low); + let mut b = loadu(state_high); + let mut c = loadu(iv_low); + + // `x ^ !x` is all ones; xor-ed into v[14] (the third lane of `d`) below, it inverts all of that word's bits when the final-block flag is set + let inverse = if f { + iv_high[3] ^ !iv_high[3] + } else { + 0 + }; + + let flags = set4( + count[0], + count[1], + inverse, + 0, + ); + + let mut d = xor(loadu(iv_high), flags); + + // split the message into eight [u64; 2] chunks + let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2); + // load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
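+ // (duplicating each two-word chunk into both 128-bit lanes means the per-lane unpack/blend operations in the round bodies below can assemble message words for both halves of a 256-bit row with a single instruction)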
+ + // m0 = __m256i([message[0], message[1], message[0], message[1]]) + let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0)); + // m1 = __m256i([message[2], message[3], message[2], message[3]]) + let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1)); + // m2 = __m256i([message[4], message[5], message[4], message[5]]) + let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2)); + // m3 = __m256i([message[6], message[7], message[6], message[7]]) + let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3)); + // m4 = __m256i([message[8], message[9], message[8], message[9]]) + let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4)); + // m5 = __m256i([message[10], message[11], message[10], message[11]]) + let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5)); + // m6 = __m256i([message[12], message[13], message[12], message[13]]) + let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6)); + // m7 = __m256i([message[14], message[15], message[14], message[15]]) + let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7)); + + let iv0 = a; + let iv1 = b; + + let mut t0; + let mut t1; + let mut b0; + + for i in 0..rounds { + match i % 10 { + 0 => { + t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2] + t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6] + b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6] + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3] + t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7] + b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7] + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8] + t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12] + b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12] + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9] + t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13] + b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13] + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 1 => { + t0 = _mm256_unpacklo_epi64(m7, m2); + t1 = _mm256_unpackhi_epi64(m4, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m5, m4); + t1 = _mm256_alignr_epi8(m3, m7, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpackhi_epi64(m2, m0); + t1 = _mm256_blend_epi32(m5, m0, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m6, m1, 8); + t1 = _mm256_blend_epi32(m3, m1, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 2 => { + // round 3 + t0 = _mm256_alignr_epi8(m6, m5, 8); + t1 = _mm256_unpackhi_epi64(m2, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = 
_mm256_unpacklo_epi64(m4, m0); + t1 = _mm256_blend_epi32(m6, m1, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m5, m4, 8); + t1 = _mm256_unpackhi_epi64(m1, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m2, m7); + t1 = _mm256_blend_epi32(m0, m3, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 3 => { + // round 4 + t0 = _mm256_unpackhi_epi64(m3, m1); + t1 = _mm256_unpackhi_epi64(m6, m5); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m4, m0); + t1 = _mm256_unpacklo_epi64(m6, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m1, m7, 8); + t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2)); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m4, m3); + t1 = _mm256_unpacklo_epi64(m5, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 4 => { + // round 5 + t0 = _mm256_unpackhi_epi64(m4, m2); + t1 = _mm256_unpacklo_epi64(m1, m5); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_blend_epi32(m3, m0, 0x33); + t1 = _mm256_blend_epi32(m7, m2, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m7, m1, 8); + t1 = _mm256_alignr_epi8(m3, m5, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m6, m0); + t1 = _mm256_unpacklo_epi64(m6, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 5 => { + // round 6 + t0 = _mm256_unpacklo_epi64(m1, m3); + t1 = _mm256_unpacklo_epi64(m0, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m6, m5); + t1 = _mm256_unpackhi_epi64(m5, m1); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_alignr_epi8(m2, m0, 8); + t1 = _mm256_unpackhi_epi64(m3, m7); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m4, m6); + t1 = _mm256_alignr_epi8(m7, m2, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 6 => { + // round 7 + t0 = _mm256_blend_epi32(m0, m6, 0x33); + t1 = _mm256_unpacklo_epi64(m7, m2); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m2, m7); + t1 = _mm256_alignr_epi8(m5, m6, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m4, m0); + t1 = _mm256_blend_epi32(m4, m3, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m5, 
m3); + t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2)); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 7 => { + // round 8 + t0 = _mm256_unpackhi_epi64(m6, m3); + t1 = _mm256_blend_epi32(m1, m6, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m7, m5, 8); + t1 = _mm256_unpackhi_epi64(m0, m4); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_blend_epi32(m2, m1, 0x33); + t1 = _mm256_alignr_epi8(m4, m7, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m5, m0); + t1 = _mm256_unpacklo_epi64(m2, m3); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + 8 => { + // round 9 + t0 = _mm256_unpacklo_epi64(m3, m7); + t1 = _mm256_alignr_epi8(m0, m5, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpackhi_epi64(m7, m4); + t1 = _mm256_alignr_epi8(m4, m1, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpacklo_epi64(m5, m6); + t1 = _mm256_unpackhi_epi64(m6, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_alignr_epi8(m1, m2, 8); + t1 = _mm256_alignr_epi8(m2, m3, 8); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + _ => { + // round 10 + t0 = _mm256_unpacklo_epi64(m5, m4); + t1 = _mm256_unpackhi_epi64(m3, m0); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_unpacklo_epi64(m1, m2); + t1 = _mm256_blend_epi32(m2, m3, 0x33); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + diagonalize(&mut a, &mut b, &mut c, &mut d); + t0 = _mm256_unpackhi_epi64(m6, m7); + t1 = _mm256_unpackhi_epi64(m4, m1); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g1(&mut a, &mut b, &mut c, &mut d, &mut b0); + t0 = _mm256_blend_epi32(m5, m0, 0x33); + t1 = _mm256_unpacklo_epi64(m7, m6); + b0 = _mm256_blend_epi32(t0, t1, 0xF0); + g2(&mut a, &mut b, &mut c, &mut d, &mut b0); + undiagonalize(&mut a, &mut b, &mut c, &mut d); + } + } + } + + a = xor(a, c); + b = xor(b, d); + a = xor(a, iv0); + b = xor(b, iv1); + + storeu(a, state_low); + storeu(b, state_high); +} + + +#[inline(always)] +unsafe fn loadu(src: *const [u64; 4]) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) { + // This is an unaligned store, so the pointer cast is allowed. 
+ _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i { + _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi64(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i { + _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64) +} + +#[inline(always)] +unsafe fn rotate_right_32(x: __m256i) -> __m256i { + _mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1)) +} + +#[inline(always)] +unsafe fn rotate_right_24(x: __m256i) -> __m256i { + let rotate24 = _mm256_setr_epi8( + 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, + 14, 15, 8, 9, 10, + ); + _mm256_shuffle_epi8(x, rotate24) +} + +#[inline(always)] +unsafe fn rotate_right_16(x: __m256i) -> __m256i { + let rotate16 = _mm256_setr_epi8( + 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, + 13, 14, 15, 8, 9, + ); + _mm256_shuffle_epi8(x, rotate16) +} + +#[inline(always)] +unsafe fn rotate_right_63(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x)) +} + +#[inline(always)] +unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) { + *a = add(*a, *m); + *a = add(*a, *b); + *d = xor(*d, *a); + *d = rotate_right_32(*d); + *c = add(*c, *d); + *b = xor(*b, *c); + *b = rotate_right_24(*b); +} + +#[inline(always)] +unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) { + *a = add(*a, *m); + *a = add(*a, *b); + *d = xor(*d, *a); + *d = rotate_right_16(*d); + *c = add(*c, *d); + *b = xor(*b, *c); + *b = rotate_right_63(*b); +} + +// Note the optimization here of leaving b as the unrotated row, rather than a. +// All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) { + *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3)); + *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2)); + *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +// Note the optimization here of leaving b as the unrotated row, rather than a. +// All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) { + *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1)); + *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2)); + *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3)); +} + + +#[cfg(test)] +mod tests { + #[test] + fn test_mm_shuffle() { + assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11); + assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00); + assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01); + } +} diff --git a/util/EIP-152/src/lib.rs b/util/EIP-152/src/lib.rs index fd68b9072..1e33842cd 100644 --- a/util/EIP-152/src/lib.rs +++ b/util/EIP-152/src/lib.rs @@ -14,20 +14,24 @@ // You should have received a copy of the GNU General Public License // along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
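A note on the rotation helpers in `avx2.rs` above: rotations by 32, 24, and 16 bits are whole-byte rotations, so they can be done with lane-local shuffles (`_mm256_shuffle_epi32`, `_mm256_shuffle_epi8`) that are cheaper than a shift/shift/or sequence; only the rotate-by-63 needs the `srli`/`add` pair, where `add(x, x)` computes `x << 1`. An illustrative, self-contained check (not part of the PR) that the byte-shuffle trick agrees with the scalar `u64::rotate_right` in every lane:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn rotate24_matches_scalar(x: u64) -> bool {
    use std::arch::x86_64::*;
    // Broadcast x into all four 64-bit lanes.
    let v = _mm256_set1_epi64x(x as i64);
    // Same mask as `rotate_right_24`: within each 64-bit lane,
    // destination byte j takes source byte (j + 3) % 8.
    let mask = _mm256_setr_epi8(
        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
    );
    let r = _mm256_shuffle_epi8(v, mask);
    let mut out = [0u64; 4];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    out.iter().all(|&lane| lane == x.rotate_right(24))
}
```

A caller would guard this behind `is_x86_feature_detected!("avx2")`, just as the dispatching `compress` in `lib.rs` below does.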
+pub mod portable; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub mod avx2; + /// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7) /// There are 10 16-byte arrays - one for each round /// the entries are calculated from the sigma constants. const SIGMA: [[usize; 16]; 10] = [ - [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], - [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4], - [ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8], - [ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13], - [ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9], - [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], - [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10], - [ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5], - [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3], + [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4], + [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8], + [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13], + [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9], + [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11], + [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10], + [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5], + [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0], ]; @@ -38,58 +42,30 @@ const IV: [u64; 8] = [ 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, ]; - -#[inline(always)] -/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1 -fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) { - v[a] = v[a].wrapping_add(v[b]).wrapping_add(x); - v[d] = (v[d] ^ v[a]).rotate_right(32); - v[c] = v[c].wrapping_add(v[d]); - v[b] = (v[b] ^ v[c]).rotate_right(24); - v[a] = v[a].wrapping_add(v[b]).wrapping_add(y); - v[d] = (v[d] ^ v[a]).rotate_right(16); - v[c] = v[c].wrapping_add(v[d]); - v[b] = (v[b] ^ v[c]).rotate_right(63); -} - -/// The Blake2 compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2 -/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final -/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first -/// parameter is modified by the function. -pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) { - let mut v = [0u64; 16]; - v[..h.len()].copy_from_slice(h); // First half from state. - v[h.len()..].copy_from_slice(&IV); // Second half from IV. - - v[12] ^= t[0]; - v[13] ^= t[1]; - - if f { - v[14] = !v[14] // Invert all bits if the last-block-flag is set. - } - for i in 0..rounds { - // Message word selection permutation for this round. 
- let s = &SIGMA[i % 10]; - g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]); - g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]); - g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]); - g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]); - - g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]); - g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]); - g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]); - g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]); +/// The blake2b compression function F. Dispatches at runtime to the AVX2 implementation when available, falling back to the portable implementation otherwise. +pub fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + unsafe { + return avx2::compress(state, message, count, f, rounds) + } + } else { + return portable::compress(state, message, count, f, rounds) + }; } - for i in 0..8 { - h[i] ^= v[i] ^ v[i + 8]; - } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + portable::compress(state, message, count, f, rounds); } #[cfg(test)] mod tests { - use crate::compress; + use crate::portable; + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + use crate::avx2; use rustc_hex::FromHex; #[test] @@ -119,9 +95,27 @@ mod tests { 0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64, ]; - compress(&mut h_in, m, c, f, rounds); - + // portable + portable::compress(&mut h_in, m, c, f, rounds); assert_eq!(h_in, h_out); + + let mut h_in = [ + 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64, + 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64, + 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64, + 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64, + ]; + + // avx + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("avx2") { + unsafe { + avx2::compress(&mut h_in, m, c, f, rounds); + assert_eq!(h_in, h_out); + } + } + } } fn to_u64_slice(vec: &[u8], slice: &mut [u64]) { @@ -130,6 +124,7 @@ }) } + #[test] fn test_vectors_from_eip() { let vec = vec![ @@ -178,15 +173,27 @@ mod tests { to_u64_slice(&bytes[4..68], &mut h); to_u64_slice(&bytes[68..196], &mut m); to_u64_slice(&bytes[196..212], &mut t); - - compress(&mut h, m, t, f, rounds as usize); - let output: Vec<u8> = output.from_hex().unwrap(); - let mut out = [0u64; 8]; to_u64_slice(&output[..], &mut out); - assert_eq!(out, h); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // avx + if is_x86_feature_detected!("avx2") { + unsafe { + avx2::compress(&mut h, m, t, f, rounds as usize); + assert_eq!(out, h); + } + } + } + + { + // portable + to_u64_slice(&bytes[4..68], &mut h); + portable::compress(&mut h, m, t, f, rounds as usize); + assert_eq!(out, h); + } } } } diff --git a/util/EIP-152/src/portable.rs b/util/EIP-152/src/portable.rs new file mode 100644 index 000000000..f0b8ac9d7 --- /dev/null +++ b/util/EIP-152/src/portable.rs @@ -0,0 +1,67 @@ +// Copyright 2015-2019 Parity Technologies (UK) Ltd. +// This file is part of Parity Ethereum. + +// Parity Ethereum is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Parity Ethereum is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>. +//!
Portable implementation of the blake2b compress function + +use crate::{IV, SIGMA}; + +/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1 +#[inline(always)] +fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) { + v[a] = v[a].wrapping_add(v[b]).wrapping_add(x); + v[d] = (v[d] ^ v[a]).rotate_right(32); + v[c] = v[c].wrapping_add(v[d]); + v[b] = (v[b] ^ v[c]).rotate_right(24); + + v[a] = v[a].wrapping_add(v[b]).wrapping_add(y); + v[d] = (v[d] ^ v[a]).rotate_right(16); + v[c] = v[c].wrapping_add(v[d]); + v[b] = (v[b] ^ v[c]).rotate_right(63); +} + +/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2 +/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final +/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first +/// parameter is modified by the function. +pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) { + let mut v = [0u64; 16]; + v[..8].copy_from_slice(h); // First half from state. + v[8..].copy_from_slice(&IV); // Second half from IV. + + v[12] ^= t[0]; + v[13] ^= t[1]; + + if f { + v[14] = !v[14]; // Invert all bits if the last-block-flag is set. + } + + for i in 0..rounds { + // Message word selection permutation for this round. + let s = &SIGMA[i % 10]; + g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]); + g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]); + g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]); + g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]); + + g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]); + g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]); + g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]); + g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]); + } + + for i in 0..8 { + h[i] ^= v[i] ^ v[i + 8]; + } +}
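For reference, a hedged end-to-end sketch of how downstream code would call the dispatching `compress` exported from `lib.rs`, reusing the RFC 7693 "abc" vector from the unit tests above (only `eip_152::compress` itself is from this PR; the surrounding scaffolding is illustrative):

```rust
use eip_152::compress;

fn main() {
    // blake2b-512 initial state for the "abc" test vector.
    let mut h = [
        0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
        0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
        0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
        0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
    ];
    // Message block: "abc" packed little-endian, zero-padded to 128 bytes.
    let mut m = [0u64; 16];
    m[0] = 0x0000000000636261;
    let count = [3, 0]; // total bytes processed so far
    let f = true;       // final-block flag
    compress(&mut h, m, count, f, 12); // standard blake2b uses 12 rounds
    // `h` now holds the eight output state words checked by the tests above.
}
```

Runtime dispatch means this same call takes the AVX2 path on capable x86 hardware and the portable path everywhere else.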