SIMD Implementation for EIP-152 (#11056)

* simd implementation

* adds benchmarks

* Update util/EIP-152/src/avx.rs

Co-Authored-By: David <dvdplm@gmail.com>

* enable avx 😅

* better benchmark, docs

* rename avx to avx2, compile on android

* fix android again

* remove ifunc, code formatting

* license

* nits

* docs, nits

* fix test
Seun LanLege, 2019-10-02 14:32:21 +01:00 (committed by GitHub)
parent ad633de6d9
commit f24bff5998
7 changed files with 833 additions and 62 deletions

Cargo.lock (generated)

@@ -810,6 +810,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 name = "eip-152"
 version = "0.1.0"
 dependencies = [
+ "arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
+ "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

util/EIP-152/Cargo.toml

@@ -12,3 +12,11 @@ edition = "2018"
 [dependencies]
 rustc-hex = "2.0.1"
+arrayref = "0.3.5"
+
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "bench"
+harness = false
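Note: `harness = false` opts the bench target out of the default libtest harness so that criterion can supply its own `main`.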

util/EIP-152/LICENSE Normal file

@@ -0,0 +1,25 @@
This program is copyright 2019 Parity Technologies Limited and its licensors.
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Some portions of the program (“the Software”) are Copyright (c) 2018 Jack O'Connor
and the following relates solely to such portions:
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

util/EIP-152/benches/bench.rs Normal file

@@ -0,0 +1,191 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
use criterion::{Criterion, criterion_group, criterion_main, black_box, Throughput, BenchmarkId};
use std::mem;
use std::sync::atomic::{AtomicPtr, Ordering};
use eip_152::portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use eip_152::avx2;
type FnRaw = *mut ();
type Blake2bF = fn(&mut [u64; 8], [u64; 16], [u64; 2], bool, usize);
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
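// `FN` initially points at `detect`. The first call through it runs CPU
// feature detection, caches the chosen implementation's address back into
// `FN`, and forwards the call; every later call loads the cached pointer and
// jumps straight to the right implementation. This is the "ifunc" (indirect
// function) dispatch pattern, benchmarked here against direct calls.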
fn detect(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
let fun = if is_x86_feature_detected!("avx2") {
avx2::compress as FnRaw
} else {
portable::compress as FnRaw
};
FN.store(fun as FnRaw, Ordering::Relaxed);
unsafe {
mem::transmute::<FnRaw, Blake2bF>(fun)(state, message, count, f, rounds)
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx_ifunc_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("avx2_ifunc");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
unsafe {
let fun = FN.load(Ordering::Relaxed);
mem::transmute::<FnRaw, Blake2bF>(fun)(
black_box(&mut state),
black_box(message),
black_box(count),
black_box(f),
black_box(**rounds as usize),
);
}
});
},
);
}
group.finish();
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("avx2");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
unsafe {
avx2::compress(
black_box(&mut state),
black_box(message),
black_box(count),
black_box(f),
black_box(**rounds as usize),
);
}
});
},
);
}
group.finish();
}
pub fn portable_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("portable_impl");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
portable::compress(
black_box(&mut state),
black_box(message),
black_box(count),
black_box(f),
black_box(**rounds as usize),
);
});
},
);
}
group.finish();
}
criterion_group!(benches, avx_benchmark, avx_ifunc_benchmark, portable_benchmark);
criterion_main!(benches);
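These benches can be run with the usual criterion invocation, e.g. `cargo bench -p eip-152` from the repository root.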

util/EIP-152/src/avx2.rs Normal file

@@ -0,0 +1,471 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
//! AVX2 implementation of the blake2b compression function.
use crate::IV;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use arrayref::{array_refs, mut_array_refs};
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
/// Takes as arguments the state vector `state`, message block vector `message`, offset counter
/// `count`, final block indicator flag `f`, and number of rounds `rounds`. The state vector
/// provided as the first parameter is modified by the function.
///
/// `g1` only operates on `x` from the original g function.
/// ```
/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) {
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
/// v[d] = (v[d] ^ v[a]).rotate_right(32);
/// v[c] = v[c].wrapping_add(v[d]);
/// v[b] = (v[b] ^ v[c]).rotate_right(24);
/// }
/// ```
///
/// `g2` only operates on `y` from the original g function.
/// ```
/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) {
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
/// v[d] = (v[d] ^ v[a]).rotate_right(16);
/// v[c] = v[c].wrapping_add(v[d]);
/// v[b] = (v[b] ^ v[c]).rotate_right(63);
/// }
/// ```
///
/// Message mixing is done based on sigma values, for a given round.
///
/// # Example
///
/// `SIGMA` for round 1, i.e. `SIGMA[0]` = `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`;
/// ```
/// let s = &SIGMA[0 % 10];
/// // a, b, c, d, x
/// g(&mut v, 0, 4, 8 , 12, m[s[0]]);
/// g(&mut v, 1, 5, 9 , 13, m[s[2]]);
/// g(&mut v, 2, 6, 10, 14, m[s[4]]);
/// g(&mut v, 3, 7, 11, 15, m[s[6]]);
///
/// let a = v[..4];
/// let b = v[4..8];
/// let c = v[8..12];
/// let d = v[12..16];
/// let mut b0 = [m[0], m[2], m[4], m[6]];
///
/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
/// // ... then construct b0 for `g2` etc.
/// ```
///
#[target_feature(enable = "avx2")]
pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
// get a mutable reference to state[0..4], state[4..]
let (state_low, state_high) = mut_array_refs!(state, 4, 4);
// get a reference to IV[0..4], IV[4..]
let (iv_low, iv_high) = array_refs!(&IV, 4, 4);
// loads them into an __m256i
let mut a = loadu(state_low);
let mut b = loadu(state_high);
let mut c = loadu(iv_low);
// `x ^ !x` is all ones, so when `f` is set the XOR below inverts every bit of
// the word holding IV[7] (the portable code's `v[14] = !v[14]`); otherwise
// `inverse` is zero and that word is left untouched.
let inverse = if f {
iv_high[3] ^ !iv_high[3]
} else {
0
};
let flags = set4(
count[0],
count[1],
inverse,
0,
);
let mut d = xor(loadu(iv_high), flags);
// split the message into eight [u64; 2] chunks
let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2);
// load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
// m0 = __m256i([message[0], message[1], message[0], message[1]])
let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
// m1 = __m256i([message[2], message[3], message[2], message[3]])
let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
// m2 = __m256i([message[4], message[5], message[4], message[5]])
let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
// m3 = __m256i([message[6], message[7], message[6], message[7]])
let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
// m4 = __m256i([message[8], message[9], message[8], message[9]])
let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
// m5 = __m256i([message[10], message[11], message[10], message[11]])
let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
// m6 = __m256i([message[12], message[13], message[12], message[13]])
let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
// m7 = __m256i([message[14], message[15], message[14], message[15]])
let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
let iv0 = a;
let iv1 = b;
let mut t0;
let mut t1;
let mut b0;
for i in 0..rounds {
match i % 10 {
0 => {
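// round 1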
t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2]
t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6]
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3]
t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7]
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8]
t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12]
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9]
t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13]
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
1 => {
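// round 2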
t0 = _mm256_unpacklo_epi64(m7, m2);
t1 = _mm256_unpackhi_epi64(m4, m6);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m5, m4);
t1 = _mm256_alignr_epi8(m3, m7, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpackhi_epi64(m2, m0);
t1 = _mm256_blend_epi32(m5, m0, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m6, m1, 8);
t1 = _mm256_blend_epi32(m3, m1, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
2 => {
// round 3
t0 = _mm256_alignr_epi8(m6, m5, 8);
t1 = _mm256_unpackhi_epi64(m2, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m4, m0);
t1 = _mm256_blend_epi32(m6, m1, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m5, m4, 8);
t1 = _mm256_unpackhi_epi64(m1, m3);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m2, m7);
t1 = _mm256_blend_epi32(m0, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
3 => {
// round 4
t0 = _mm256_unpackhi_epi64(m3, m1);
t1 = _mm256_unpackhi_epi64(m6, m5);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m4, m0);
t1 = _mm256_unpacklo_epi64(m6, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m1, m7, 8);
t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m4, m3);
t1 = _mm256_unpacklo_epi64(m5, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
4 => {
// round 5
t0 = _mm256_unpackhi_epi64(m4, m2);
t1 = _mm256_unpacklo_epi64(m1, m5);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_blend_epi32(m3, m0, 0x33);
t1 = _mm256_blend_epi32(m7, m2, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m7, m1, 8);
t1 = _mm256_alignr_epi8(m3, m5, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m6, m0);
t1 = _mm256_unpacklo_epi64(m6, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
5 => {
// round 6
t0 = _mm256_unpacklo_epi64(m1, m3);
t1 = _mm256_unpacklo_epi64(m0, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m6, m5);
t1 = _mm256_unpackhi_epi64(m5, m1);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m2, m0, 8);
t1 = _mm256_unpackhi_epi64(m3, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m4, m6);
t1 = _mm256_alignr_epi8(m7, m2, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
6 => {
// round 7
t0 = _mm256_blend_epi32(m0, m6, 0x33);
t1 = _mm256_unpacklo_epi64(m7, m2);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m2, m7);
t1 = _mm256_alignr_epi8(m5, m6, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m4, m0);
t1 = _mm256_blend_epi32(m4, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m5, m3);
t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
7 => {
// round 8
t0 = _mm256_unpackhi_epi64(m6, m3);
t1 = _mm256_blend_epi32(m1, m6, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m7, m5, 8);
t1 = _mm256_unpackhi_epi64(m0, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_blend_epi32(m2, m1, 0x33);
t1 = _mm256_alignr_epi8(m4, m7, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m5, m0);
t1 = _mm256_unpacklo_epi64(m2, m3);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
8 => {
// round 9
t0 = _mm256_unpacklo_epi64(m3, m7);
t1 = _mm256_alignr_epi8(m0, m5, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m7, m4);
t1 = _mm256_alignr_epi8(m4, m1, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m5, m6);
t1 = _mm256_unpackhi_epi64(m6, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m1, m2, 8);
t1 = _mm256_alignr_epi8(m2, m3, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
_ => {
// round 10
t0 = _mm256_unpacklo_epi64(m5, m4);
t1 = _mm256_unpackhi_epi64(m3, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m1, m2);
t1 = _mm256_blend_epi32(m2, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpackhi_epi64(m6, m7);
t1 = _mm256_unpackhi_epi64(m4, m1);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_blend_epi32(m5, m0, 0x33);
t1 = _mm256_unpacklo_epi64(m7, m6);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
}
}
a = xor(a, c);
b = xor(b, d);
a = xor(a, iv0);
b = xor(b, iv1);
storeu(a, state_low);
storeu(b, state_high);
}
#[inline(always)]
unsafe fn loadu(src: *const [u64; 4]) -> __m256i {
// This is an unaligned load, so the pointer cast is allowed.
_mm256_loadu_si256(src as *const __m256i)
}
#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) {
// This is an unaligned store, so the pointer cast is allowed.
_mm256_storeu_si256(dest as *mut __m256i, src)
}
#[inline(always)]
unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i {
_mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
}
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
_mm256_add_epi64(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
_mm256_xor_si256(a, b)
}
#[inline(always)]
unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
_mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
}
#[inline(always)]
unsafe fn rotate_right_32(x: __m256i) -> __m256i {
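// rotating a 64-bit lane right by 32 is just swapping its two 32-bit halves,
// which a single 32-bit shuffle does.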
_mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
}
#[inline(always)]
unsafe fn rotate_right_24(x: __m256i) -> __m256i {
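// a rotation by a whole number of bytes (24 bits = 3 bytes) can be done with a
// byte shuffle: within each 8-byte lane, result byte i takes source byte
// (i + 3) % 8. `rotate_right_16` below works the same way with a 2-byte offset.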
let rotate24 = _mm256_setr_epi8(
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
14, 15, 8, 9, 10,
);
_mm256_shuffle_epi8(x, rotate24)
}
#[inline(always)]
unsafe fn rotate_right_16(x: __m256i) -> __m256i {
let rotate16 = _mm256_setr_epi8(
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
13, 14, 15, 8, 9,
);
_mm256_shuffle_epi8(x, rotate16)
}
#[inline(always)]
unsafe fn rotate_right_63(x: __m256i) -> __m256i {
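// rotate right by 63 == rotate left by 1: (x >> 63) | (x << 1), where the
// cheap `add(x, x)` stands in for the `x << 1` half.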
_mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
}
#[inline(always)]
unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
*a = add(*a, *m);
*a = add(*a, *b);
*d = xor(*d, *a);
*d = rotate_right_32(*d);
*c = add(*c, *d);
*b = xor(*b, *c);
*b = rotate_right_24(*b);
}
#[inline(always)]
unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
*a = add(*a, *m);
*a = add(*a, *b);
*d = xor(*d, *a);
*d = rotate_right_16(*d);
*c = add(*c, *d);
*b = xor(*b, *c);
*b = rotate_right_63(*b);
}
// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
}
// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
}
#[cfg(test)]
mod tests {
#[test]
fn test_mm_shuffle() {
assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11);
assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00);
assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01);
}
}

util/EIP-152/src/lib.rs

@@ -14,20 +14,24 @@
 // You should have received a copy of the GNU General Public License
 // along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.

+pub mod portable;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub mod avx2;
+
 /// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7)
 /// There are 10 16-byte arrays - one for each round
 /// the entries are calculated from the sigma constants.
 const SIGMA: [[usize; 16]; 10] = [
-	[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 	[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
 	[11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
-	[ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
-	[ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
-	[ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
+	[7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
+	[9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
+	[2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
 	[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
 	[13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
-	[ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
+	[6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
 	[10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
 ];

@@ -38,58 +42,30 @@ const IV: [u64; 8] = [
 	0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 ];

-#[inline(always)]
-/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
-fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
-	v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
-	v[d] = (v[d] ^ v[a]).rotate_right(32);
-	v[c] = v[c].wrapping_add(v[d]);
-	v[b] = (v[b] ^ v[c]).rotate_right(24);
-	v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
-	v[d] = (v[d] ^ v[a]).rotate_right(16);
-	v[c] = v[c].wrapping_add(v[d]);
-	v[b] = (v[b] ^ v[c]).rotate_right(63);
-}
-
-/// The Blake2 compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
-/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
-/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
-/// parameter is modified by the function.
-pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
-	let mut v = [0u64; 16];
-	v[..h.len()].copy_from_slice(h); // First half from state.
-	v[h.len()..].copy_from_slice(&IV); // Second half from IV.
-
-	v[12] ^= t[0];
-	v[13] ^= t[1];
-
-	if f {
-		v[14] = !v[14] // Invert all bits if the last-block-flag is set.
-	}
-
-	for i in 0..rounds {
-		// Message word selection permutation for this round.
-		let s = &SIGMA[i % 10];
-		g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
-		g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
-		g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
-		g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
-		g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
-		g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
-		g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
-		g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
-	}
-
-	for i in 0..8 {
-		h[i] ^= v[i] ^ v[i + 8];
-	}
+/// blake2b compression function
+pub fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
+	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+	{
+		if is_x86_feature_detected!("avx2") {
+			unsafe {
+				return avx2::compress(state, message, count, f, rounds)
+			}
+		} else {
+			return portable::compress(state, message, count, f, rounds)
+		};
+	}
+
+	#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+	portable::compress(state, message, count, f, rounds);
 }

 #[cfg(test)]
 mod tests {
-	use crate::compress;
+	use crate::portable;
+	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+	use crate::avx2;
 	use rustc_hex::FromHex;

 	#[test]
@@ -119,9 +95,27 @@ mod tests {
 			0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64,
 		];

-		compress(&mut h_in, m, c, f, rounds);
+		// portable
+		portable::compress(&mut h_in, m, c, f, rounds);
 		assert_eq!(h_in, h_out);
+
+		let mut h_in = [
+			0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
+			0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
+			0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
+			0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
+		];
+
+		// avx
+		#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+		{
+			if is_x86_feature_detected!("avx2") {
+				unsafe {
+					avx2::compress(&mut h_in, m, c, f, rounds);
+					assert_eq!(h_in, h_out);
+				}
+			}
+		}
 	}

@@ -130,6 +124,7 @@ mod tests {
 		})
 	}

 	#[test]
 	fn test_vectors_from_eip() {
 		let vec = vec![
@@ -178,15 +173,27 @@ mod tests {
 			to_u64_slice(&bytes[4..68], &mut h);
 			to_u64_slice(&bytes[68..196], &mut m);
 			to_u64_slice(&bytes[196..212], &mut t);
-			compress(&mut h, m, t, f, rounds as usize);

 			let output: Vec<u8> = output.from_hex().unwrap();
 			let mut out = [0u64; 8];
 			to_u64_slice(&output[..], &mut out);

-			assert_eq!(out, h);
+			#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+			{
+				// avx
+				if is_x86_feature_detected!("avx2") {
+					unsafe {
+						avx2::compress(&mut h, m, t, f, rounds as usize);
+						assert_eq!(out, h);
+					}
+				}
+			}
+
+			{
+				// portable
+				to_u64_slice(&bytes[4..68], &mut h);
+				portable::compress(&mut h, m, t, f, rounds as usize);
+				assert_eq!(out, h);
+			}
 		}
 	}
 }
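For callers, the public API is now this one `compress` function, which picks the AVX2 path at runtime when available. A minimal usage sketch (the inputs are the RFC 7693 "abc" test block already used in the tests above; the standalone binary itself is hypothetical):

use eip_152::compress;

fn main() {
	// blake2b "abc" test inputs, as in the tests above.
	let mut h = [
		0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
		0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
		0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
		0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
	];
	let mut m = [0u64; 16];
	m[0] = 0x0000000000636261_u64; // "abc", little-endian

	// count = 3 bytes processed, final block, the standard 12 blake2b rounds
	// (EIP-152 lets the caller choose the round count).
	compress(&mut h, m, [3, 0], true, 12);
	println!("{:x?}", &h);
}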

util/EIP-152/src/portable.rs Normal file

@@ -0,0 +1,67 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
//! Portable implementation of the blake2b compress function
use crate::{IV, SIGMA};
/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
#[inline(always)]
fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
v[d] = (v[d] ^ v[a]).rotate_right(32);
v[c] = v[c].wrapping_add(v[d]);
v[b] = (v[b] ^ v[c]).rotate_right(24);
v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
v[d] = (v[d] ^ v[a]).rotate_right(16);
v[c] = v[c].wrapping_add(v[d]);
v[b] = (v[b] ^ v[c]).rotate_right(63);
}
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
/// parameter is modified by the function.
pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
let mut v = [0u64; 16];
v[..8].copy_from_slice(h); // First half from state.
v[8..].copy_from_slice(&IV); // Second half from IV.
v[12] ^= t[0];
v[13] ^= t[1];
if f {
v[14] = !v[14]; // Invert all bits if the last-block-flag is set.
}
for i in 0..rounds {
// Message word selection permutation for this round.
let s = &SIGMA[i % 10];
g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
}
for i in 0..8 {
h[i] ^= v[i] ^ v[i + 8];
}
}
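Both back ends implement the same function F, so a differential check along these lines (a sketch only, not part of this commit; it would sit inside the crate since `IV` is crate-private) can exercise one against the other on AVX2-capable hardware:

#[cfg(all(test, any(target_arch = "x86", target_arch = "x86_64")))]
mod differential {
	#[test]
	fn avx2_matches_portable() {
		if !is_x86_feature_detected!("avx2") {
			return; // nothing to compare against on this CPU
		}
		let m = [0x0123456789abcdef_u64; 16];
		for rounds in 0..64 {
			let mut h_portable = crate::IV;
			let mut h_avx2 = crate::IV;
			crate::portable::compress(&mut h_portable, m, [128, 0], true, rounds);
			// Safe to call: the avx2 feature check above passed.
			unsafe { crate::avx2::compress(&mut h_avx2, m, [128, 0], true, rounds) };
			assert_eq!(h_portable, h_avx2, "mismatch at {} rounds", rounds);
		}
	}
}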