SIMD Implementation for EIP-152 (#11056)

* simd implementation

* adds benchmarks

* Update util/EIP-152/src/avx.rs

Co-Authored-By: David <dvdplm@gmail.com>

* enable avx 😅

* better benchmark, docs

* rename avx to avx2, compile on android

* fix android again

* remove ifunc, code formatting

* license

* nits

* docs, nits

* fix test
Seun LanLege, 2019-10-02 14:32:21 +01:00 (committed by GitHub)
parent ad633de6d9 · commit f24bff5998
7 changed files with 833 additions and 62 deletions

Cargo.lock (generated, 2 changes)

@@ -810,6 +810,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
name = "eip-152"
version = "0.1.0"
dependencies = [
"arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

util/EIP-152/Cargo.toml

@@ -12,3 +12,11 @@ edition = "2018"
[dependencies]
rustc-hex = "2.0.1"
arrayref = "0.3.5"

[dev-dependencies]
criterion = "0.3"

[[bench]]
name = "bench"
harness = false
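
Setting `harness = false` tells cargo not to link libtest's default bench harness, so the bench target must supply its own `main`; criterion's `criterion_main!` macro generates it, as the full benchmark file below does. A minimal criterion skeleton of that shape (names here are illustrative):

```rust
use criterion::{criterion_group, criterion_main, Criterion};

fn my_bench(c: &mut Criterion) {
    c.bench_function("noop", |b| b.iter(|| ()));
}

criterion_group!(benches, my_bench);
criterion_main!(benches); // expands to `fn main`, replacing the libtest harness
```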

util/EIP-152/LICENSE (new file, 25 lines)

@@ -0,0 +1,25 @@
This program is copyright 2019 Parity Technologies Limited and its licensors.

GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007

Some portions of the program (“the Software”) are Copyright (c) 2018 Jack O'Connor
and the following relates solely to such portions:

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

util/EIP-152/benches/bench.rs (new file)

@@ -0,0 +1,191 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
use criterion::{Criterion, criterion_group, criterion_main, black_box, Throughput, BenchmarkId};
use std::mem;
use std::sync::atomic::{AtomicPtr, Ordering};
use eip_152::portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use eip_152::avx2;
type FnRaw = *mut ();
type Blake2bF = fn(&mut [u64; 8], [u64; 16], [u64; 2], bool, usize);
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
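// `detect` is the initial target of `FN`: on first call it picks an
// implementation, caches it in `FN`, then forwards the call, so later
// calls skip feature detection entirely (ifunc-style dispatch).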
fn detect(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
let fun = if is_x86_feature_detected!("avx2") {
avx2::compress as FnRaw
} else {
portable::compress as FnRaw
};
FN.store(fun as FnRaw, Ordering::Relaxed);
unsafe {
mem::transmute::<FnRaw, Blake2bF>(fun)(state, message, count, f, rounds)
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx_ifunc_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("avx2_ifunc");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
unsafe {
let fun = FN.load(Ordering::Relaxed);
mem::transmute::<FnRaw, Blake2bF>(fun)(
	black_box(&mut state),
	black_box(message),
	black_box(count),
	black_box(f),
	black_box(**rounds as usize),
);
}
});
},
);
}
group.finish();
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("avx2");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
unsafe {
avx2::compress(
black_box(&mut state),
black_box(message),
black_box(count),
black_box(f),
black_box(**rounds as usize),
);
}
});
},
);
}
group.finish();
}
pub fn portable_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("portable_impl");
for rounds in [12, 50, 100].iter() {
group.throughput(Throughput::Elements(*rounds as u64));
group.bench_with_input(
BenchmarkId::new("rounds", rounds),
&rounds,
|b, rounds| {
let mut state = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
let message = [
0x0000000000636261_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
0x0000000000000000_u64, 0x0000000000000000_u64,
];
let count = [3, 0];
let f = true;
b.iter(|| {
portable::compress(
black_box(&mut state),
black_box(message),
black_box(count),
black_box(f),
black_box(**rounds as usize),
);
});
},
);
}
group.finish();
}
criterion_group!(benches, avx_benchmark, avx_ifunc_benchmark, portable_benchmark);
criterion_main!(benches);
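
The `avx2_ifunc` benchmark above exists to price the dispatch pattern itself: `FN` starts out pointing at `detect`, which resolves an implementation once, caches it in the `AtomicPtr`, and forwards the first call; every later call is a relaxed load plus an indirect call. A self-contained sketch of the pattern (all names here are illustrative, not part of the crate):

```rust
use std::mem;
use std::sync::atomic::{AtomicPtr, Ordering};

type FnRaw = *mut ();
type Impl = fn(&mut u64);

// Starts out pointing at `detect`; after the first call it holds the
// resolved implementation.
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);

fn double(x: &mut u64) { *x = x.wrapping_mul(2) }

fn detect(x: &mut u64) {
    // Feature detection would go here (e.g. is_x86_feature_detected!);
    // this sketch always picks `double`.
    let chosen: Impl = double;
    FN.store(chosen as FnRaw, Ordering::Relaxed);
    chosen(x)
}

fn call(x: &mut u64) {
    // Relaxed is fine: the worst case is another thread re-running `detect`.
    let f = FN.load(Ordering::Relaxed);
    unsafe { mem::transmute::<FnRaw, Impl>(f)(x) }
}

fn main() {
    let mut v = 1;
    call(&mut v); // first call goes through `detect` and caches `double`
    call(&mut v); // second call jumps straight to the cached pointer
    assert_eq!(v, 4);
}
```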

util/EIP-152/src/avx2.rs (new file, 471 lines)

@@ -0,0 +1,471 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
//! AVX2 implementation of the blake2b compression function.
use crate::IV;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use arrayref::{array_refs, mut_array_refs};
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
/// Takes as an argument the state vector `state`, message block vector `message`, offset counter `count`, final
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
/// parameter is modified by the function.
///
/// `g1` only operates on `x` from the original g function.
/// ```
/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) {
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
/// v[d] = (v[d] ^ v[a]).rotate_right(32);
/// v[c] = v[c].wrapping_add(v[d]);
/// v[b] = (v[b] ^ v[c]).rotate_right(24);
/// }
/// ```
///
/// `g2` only operates on `y` from the original g function.
/// ```
/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) {
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
/// v[d] = (v[d] ^ v[a]).rotate_right(16);
/// v[c] = v[c].wrapping_add(v[d]);
/// v[b] = (v[b] ^ v[c]).rotate_right(63);
/// }
/// ```
///
/// Message mixing is done based on sigma values, for a given round.
///
/// # Example
///
/// `SIGMA` for round 1, i.e. `SIGMA[0]` = `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`:
/// ```
/// let s = &SIGMA[0 % 10];
/// // a, b, c, d, x
/// g(&mut v, 0, 4, 8 , 12, m[s[0]]);
/// g(&mut v, 1, 5, 9 , 13, m[s[2]]);
/// g(&mut v, 2, 6, 10, 14, m[s[4]]);
/// g(&mut v, 3, 7, 11, 15, m[s[6]]);
///
/// let a = v[..4];
/// let b = v[4..8];
/// let c = v[8..12];
/// let d = v[12..16];
/// let mut b0 = [m[0], m[2], m[4], m[6]];
///
/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
/// // ... then construct b0 for `g2` etc.
/// ```
///
#[target_feature(enable = "avx2")]
pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
// get a mutable reference to state[0..4], state[4..]
let (state_low, state_high) = mut_array_refs!(state, 4, 4);
// get a reference to IV[0..4], IV[4..]
let (iv_low, iv_high) = array_refs!(&IV, 4, 4);
// loads them into an __m256i
let mut a = loadu(state_low);
let mut b = loadu(state_high);
let mut c = loadu(iv_low);
// `x ^ !x` sets every bit, so when the final-block flag is set `inverse`
// is an all-ones mask; xored into the last row below it inverts v[14],
// exactly as the portable implementation does.
let inverse = if f {
	iv_high[3] ^ !iv_high[3]
} else {
	0
};
let flags = set4(
count[0],
count[1],
inverse,
0,
);
let mut d = xor(loadu(iv_high), flags);
// split the message into eight [u64; 2] chunks
let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2);
// load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
// m0 = __m256i([message[0], message[1], message[0], message[1]])
let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
// m1 = __m256i([message[2], message[3], message[2], message[3]])
let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
// m2 = __m256i([message[4], message[5], message[4], message[5]])
let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
// m3 = __m256i([message[6], message[7], message[6], message[7]])
let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
// m4 = __m256i([message[8], message[9], message[8], message[9]])
let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
// m5 = __m256i([message[10], message[11], message[10], message[11]])
let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
// m6 = __m256i([message[12], message[13], message[12], message[13]])
let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
// m7 = __m256i([message[14], message[15], message[14], message[15]])
let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
let iv0 = a;
let iv1 = b;
let mut t0;
let mut t1;
let mut b0;
for i in 0..rounds {
match i % 10 {
0 => {
t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2]
t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6]
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3]
t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7]
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8]
t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12]
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9]
t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13]
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13]
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
1 => {
t0 = _mm256_unpacklo_epi64(m7, m2);
t1 = _mm256_unpackhi_epi64(m4, m6);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m5, m4);
t1 = _mm256_alignr_epi8(m3, m7, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpackhi_epi64(m2, m0);
t1 = _mm256_blend_epi32(m5, m0, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m6, m1, 8);
t1 = _mm256_blend_epi32(m3, m1, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
2 => {
// round 3
t0 = _mm256_alignr_epi8(m6, m5, 8);
t1 = _mm256_unpackhi_epi64(m2, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m4, m0);
t1 = _mm256_blend_epi32(m6, m1, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m5, m4, 8);
t1 = _mm256_unpackhi_epi64(m1, m3);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m2, m7);
t1 = _mm256_blend_epi32(m0, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
3 => {
// round 4
t0 = _mm256_unpackhi_epi64(m3, m1);
t1 = _mm256_unpackhi_epi64(m6, m5);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m4, m0);
t1 = _mm256_unpacklo_epi64(m6, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m1, m7, 8);
t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m4, m3);
t1 = _mm256_unpacklo_epi64(m5, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
4 => {
// round 5
t0 = _mm256_unpackhi_epi64(m4, m2);
t1 = _mm256_unpacklo_epi64(m1, m5);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_blend_epi32(m3, m0, 0x33);
t1 = _mm256_blend_epi32(m7, m2, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m7, m1, 8);
t1 = _mm256_alignr_epi8(m3, m5, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m6, m0);
t1 = _mm256_unpacklo_epi64(m6, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
5 => {
// round 6
t0 = _mm256_unpacklo_epi64(m1, m3);
t1 = _mm256_unpacklo_epi64(m0, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m6, m5);
t1 = _mm256_unpackhi_epi64(m5, m1);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_alignr_epi8(m2, m0, 8);
t1 = _mm256_unpackhi_epi64(m3, m7);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m4, m6);
t1 = _mm256_alignr_epi8(m7, m2, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
6 => {
// round 7
t0 = _mm256_blend_epi32(m0, m6, 0x33);
t1 = _mm256_unpacklo_epi64(m7, m2);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m2, m7);
t1 = _mm256_alignr_epi8(m5, m6, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m4, m0);
t1 = _mm256_blend_epi32(m4, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m5, m3);
t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
7 => {
// round 8
t0 = _mm256_unpackhi_epi64(m6, m3);
t1 = _mm256_blend_epi32(m1, m6, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m7, m5, 8);
t1 = _mm256_unpackhi_epi64(m0, m4);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_blend_epi32(m2, m1, 0x33);
t1 = _mm256_alignr_epi8(m4, m7, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m5, m0);
t1 = _mm256_unpacklo_epi64(m2, m3);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
8 => {
// round 9
t0 = _mm256_unpacklo_epi64(m3, m7);
t1 = _mm256_alignr_epi8(m0, m5, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpackhi_epi64(m7, m4);
t1 = _mm256_alignr_epi8(m4, m1, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpacklo_epi64(m5, m6);
t1 = _mm256_unpackhi_epi64(m6, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_alignr_epi8(m1, m2, 8);
t1 = _mm256_alignr_epi8(m2, m3, 8);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
_ => {
// round 10
t0 = _mm256_unpacklo_epi64(m5, m4);
t1 = _mm256_unpackhi_epi64(m3, m0);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_unpacklo_epi64(m1, m2);
t1 = _mm256_blend_epi32(m2, m3, 0x33);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
diagonalize(&mut a, &mut b, &mut c, &mut d);
t0 = _mm256_unpackhi_epi64(m6, m7);
t1 = _mm256_unpackhi_epi64(m4, m1);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
t0 = _mm256_blend_epi32(m5, m0, 0x33);
t1 = _mm256_unpacklo_epi64(m7, m6);
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
undiagonalize(&mut a, &mut b, &mut c, &mut d);
}
}
}
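	// Feed-forward: h[0..4] ^= a ^ c and h[4..8] ^= b ^ d, the vector form
	// of the portable `h[i] ^= v[i] ^ v[i + 8]`.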
a = xor(a, c);
b = xor(b, d);
a = xor(a, iv0);
b = xor(b, iv1);
storeu(a, state_low);
storeu(b, state_high);
}
#[inline(always)]
unsafe fn loadu(src: *const [u64; 4]) -> __m256i {
// This is an unaligned load, so the pointer cast is allowed.
_mm256_loadu_si256(src as *const __m256i)
}
#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) {
// This is an unaligned store, so the pointer cast is allowed.
_mm256_storeu_si256(dest as *mut __m256i, src)
}
#[inline(always)]
unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i {
_mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
}
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
_mm256_add_epi64(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
_mm256_xor_si256(a, b)
}
#[inline(always)]
unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
_mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
}
#[inline(always)]
unsafe fn rotate_right_32(x: __m256i) -> __m256i {
_mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
}
#[inline(always)]
unsafe fn rotate_right_24(x: __m256i) -> __m256i {
let rotate24 = _mm256_setr_epi8(
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
14, 15, 8, 9, 10,
);
_mm256_shuffle_epi8(x, rotate24)
}
#[inline(always)]
unsafe fn rotate_right_16(x: __m256i) -> __m256i {
let rotate16 = _mm256_setr_epi8(
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
13, 14, 15, 8, 9,
);
_mm256_shuffle_epi8(x, rotate16)
}
#[inline(always)]
unsafe fn rotate_right_63(x: __m256i) -> __m256i {
_mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
}
#[inline(always)]
unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
*a = add(*a, *m);
*a = add(*a, *b);
*d = xor(*d, *a);
*d = rotate_right_32(*d);
*c = add(*c, *d);
*b = xor(*b, *c);
*b = rotate_right_24(*b);
}
#[inline(always)]
unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
*a = add(*a, *m);
*a = add(*a, *b);
*d = xor(*d, *a);
*d = rotate_right_16(*d);
*c = add(*c, *d);
*b = xor(*b, *c);
*b = rotate_right_63(*b);
}
// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
}
// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
}
#[cfg(test)]
mod tests {
#[test]
fn test_mm_shuffle() {
assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11);
assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00);
assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01);
}
}
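
One detail worth calling out from the helpers above: `rotate_right_32` is a 32-bit lane shuffle and `rotate_right_24`/`rotate_right_16` are byte shuffles, but a 63-bit rotation has no shuffle form, so `rotate_right_63` combines a logical shift with `add(x, x)`: adding a lane to itself doubles it, i.e. shifts it left by one. A scalar sketch of that identity:

```rust
// Rotating a 64-bit word right by 63 equals rotating it left by 1:
// (x >> 63) recovers the carried-out top bit and `x + x` is `x << 1`.
fn rotate_right_63_scalar(x: u64) -> u64 {
    (x >> 63) | x.wrapping_add(x)
}

fn main() {
    for &x in &[0u64, 1, 0x8000_0000_0000_0000, 0xdead_beef_0bad_cafe] {
        assert_eq!(rotate_right_63_scalar(x), x.rotate_right(63));
    }
}
```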

util/EIP-152/src/lib.rs

@@ -14,19 +14,23 @@
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
pub mod portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod avx2;
/// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7)
/// There are 10 arrays of 16 entries, one for each round;
/// the entries are the sigma message-word selection constants.
const SIGMA: [[usize; 16]; 10] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
[11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
[7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
[9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
[2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
[13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
[6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
[10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
];
@@ -38,58 +42,30 @@ const IV: [u64; 8] = [
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
];
/// blake2b compression function
pub fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
	#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
	{
		if is_x86_feature_detected!("avx2") {
			unsafe {
				return avx2::compress(state, message, count, f, rounds)
			}
		} else {
			return portable::compress(state, message, count, f, rounds)
		};
	}

	#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
	portable::compress(state, message, count, f, rounds);
}
#[cfg(test)]
mod tests {
use crate::compress;
use crate::portable;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::avx2;
use rustc_hex::FromHex;
#[test]
@@ -119,9 +95,27 @@ mod tests {
0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64,
];
// portable
portable::compress(&mut h_in, m, c, f, rounds);
assert_eq!(h_in, h_out);
let mut h_in = [
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
];
// avx
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if is_x86_feature_detected!("avx2") {
unsafe {
avx2::compress(&mut h_in, m, c, f, rounds);
assert_eq!(h_in, h_out);
}
}
}
}
fn to_u64_slice(vec: &[u8], slice: &mut [u64]) {
@@ -130,6 +124,7 @@ mod tests {
})
}
#[test]
fn test_vectors_from_eip() {
let vec = vec![
@@ -178,15 +173,27 @@ mod tests {
to_u64_slice(&bytes[4..68], &mut h);
to_u64_slice(&bytes[68..196], &mut m);
to_u64_slice(&bytes[196..212], &mut t);
let output: Vec<u8> = output.from_hex().unwrap();
let mut out = [0u64; 8];
to_u64_slice(&output[..], &mut out);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
// avx
if is_x86_feature_detected!("avx2") {
unsafe {
avx2::compress(&mut h, m, t, f, rounds as usize);
assert_eq!(out, h);
}
}
}
{
// portable
to_u64_slice(&bytes[4..68], &mut h);
portable::compress(&mut h, m, t, f, rounds as usize);
assert_eq!(out, h);
}
}
}
}
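
With the dispatcher in place, callers just use `eip_152::compress`. A sketch of a call with the RFC 7693 "abc" single-block vector used by the benchmarks above (12 rounds, final block, 3 bytes hashed), assuming the crate as a dependency:

```rust
use eip_152::compress;

fn main() {
    // BLAKE2b-512 initial state: the IV with the parameter block
    // (digest length 64, fanout 1, depth 1) xored into the first word.
    let mut h = [
        0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
        0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
        0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
        0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
    ];
    // "abc" in a zero-padded 128-byte block, as 16 little-endian words.
    let mut m = [0u64; 16];
    m[0] = 0x0000000000636261;
    // count = 3 bytes hashed, f = final block, 12 rounds (standard BLAKE2b).
    compress(&mut h, m, [3, 0], true, 12);
    // `h` now holds the BLAKE2b-512 digest of "abc" as little-endian words.
    println!("{:016x}", h[0]);
}
```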

util/EIP-152/src/portable.rs (new file)

@@ -0,0 +1,67 @@
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.
// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
//! Portable implementation of the blake2b compress function
use crate::{IV, SIGMA};
/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
#[inline(always)]
fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
v[d] = (v[d] ^ v[a]).rotate_right(32);
v[c] = v[c].wrapping_add(v[d]);
v[b] = (v[b] ^ v[c]).rotate_right(24);
v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
v[d] = (v[d] ^ v[a]).rotate_right(16);
v[c] = v[c].wrapping_add(v[d]);
v[b] = (v[b] ^ v[c]).rotate_right(63);
}
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
/// parameter is modified by the function.
pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
let mut v = [0u64; 16];
v[..8].copy_from_slice(h); // First half from state.
v[8..].copy_from_slice(&IV); // Second half from IV.
v[12] ^= t[0];
v[13] ^= t[1];
if f {
v[14] = !v[14]; // Invert all bits if the last-block-flag is set.
}
for i in 0..rounds {
// Message word selection permutation for this round.
let s = &SIGMA[i % 10];
g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
}
for i in 0..8 {
h[i] ^= v[i] ^ v[i + 8];
}
}
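
The eight `g` calls per round apply G first down the columns of the 4x4 view of `v` (rows `v[0..4]`, `v[4..8]`, `v[8..12]`, `v[12..16]`), then down its diagonals. A loop-style restatement of one round, assuming the same `g` as defined above, purely to make that structure explicit:

```rust
// Columns of the 4x4 state matrix, then its diagonals.
const COLUMNS: [[usize; 4]; 4] = [
    [0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15],
];
const DIAGONALS: [[usize; 4]; 4] = [
    [0, 5, 10, 15], [1, 6, 11, 12], [2, 7, 8, 13], [3, 4, 9, 14],
];

fn round(v: &mut [u64; 16], m: &[u64; 16], s: &[usize; 16]) {
    // Column step: consumes message words s[0..8], two per call.
    for (i, [a, b, c, d]) in COLUMNS.iter().copied().enumerate() {
        g(v, a, b, c, d, m[s[2 * i]], m[s[2 * i + 1]]);
    }
    // Diagonal step: consumes message words s[8..16].
    for (i, [a, b, c, d]) in DIAGONALS.iter().copied().enumerate() {
        g(v, a, b, c, d, m[s[8 + 2 * i]], m[s[8 + 2 * i + 1]]);
    }
}
```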