diff --git a/Cargo.lock b/Cargo.lock
index 88f1028d8..48206bddc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -810,6 +810,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
name = "eip-152"
version = "0.1.0"
dependencies = [
+ "arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
+ "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
diff --git a/util/EIP-152/Cargo.toml b/util/EIP-152/Cargo.toml
index fe65d0110..fb474919f 100644
--- a/util/EIP-152/Cargo.toml
+++ b/util/EIP-152/Cargo.toml
@@ -12,3 +12,11 @@ edition = "2018"
[dependencies]
rustc-hex = "2.0.1"
+arrayref = "0.3.5"
+
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "bench"
+harness = false
diff --git a/util/EIP-152/LICENSE b/util/EIP-152/LICENSE
new file mode 100644
index 000000000..f5ab386d4
--- /dev/null
+++ b/util/EIP-152/LICENSE
@@ -0,0 +1,25 @@
+This program is copyright 2019 Parity Technologies Limited and its licensors.
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+Some portions of the program (“the Software”) are Copyright (c) 2018 Jack O'Connor
+and the following relates solely to such portions:
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/util/EIP-152/benches/bench.rs b/util/EIP-152/benches/bench.rs
new file mode 100644
index 000000000..8f6278af8
--- /dev/null
+++ b/util/EIP-152/benches/bench.rs
@@ -0,0 +1,191 @@
+// Copyright 2015-2019 Parity Technologies (UK) Ltd.
+// This file is part of Parity Ethereum.
+
+// Parity Ethereum is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Parity Ethereum is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
+
+
+use criterion::{Criterion, criterion_group, criterion_main, black_box, Throughput, BenchmarkId};
+use std::mem;
+use std::sync::atomic::{AtomicPtr, Ordering};
+use eip_152::portable;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use eip_152::avx2;
+
+type FnRaw = *mut ();
+type Blake2bF = fn(&mut [u64; 8], [u64; 16], [u64; 2], bool, usize);
+
+static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
+
+fn detect(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
+ let fun = if is_x86_feature_detected!("avx2") {
+ avx2::compress as FnRaw
+ } else {
+ portable::compress as FnRaw
+ };
+ FN.store(fun as FnRaw, Ordering::Relaxed);
+ unsafe {
+ mem::transmute::<FnRaw, Blake2bF>(fun)(state, message, count, f, rounds)
+ }
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub fn avx_ifunc_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("avx2_ifunc");
+
+ for rounds in [12, 50, 100].iter() {
+ group.throughput(Throughput::Elements(*rounds as u64));
+ group.bench_with_input(
+ BenchmarkId::new("rounds", rounds),
+ &rounds,
+ |b, rounds| {
+ let mut state = [
+ 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
+ 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
+ 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
+ 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
+ ];
+
+ let message = [
+ 0x0000000000636261_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ ];
+ let count = [3, 0];
+ let f = true;
+
+ b.iter(|| {
+ unsafe {
+ let fun = FN.load(Ordering::Relaxed);
+ mem::transmute::<FnRaw, Blake2bF>
+ (fun)
+ (
+ black_box(&mut state),
+ black_box(message),
+ black_box(count),
+ black_box(f),
+ black_box(**rounds as usize),
+ );
+ }
+ });
+ },
+ );
+ }
+
+ group.finish();
+}
+
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub fn avx_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("avx2");
+
+ for rounds in [12, 50, 100].iter() {
+ group.throughput(Throughput::Elements(*rounds as u64));
+ group.bench_with_input(
+ BenchmarkId::new("rounds", rounds),
+ &rounds,
+ |b, rounds| {
+ let mut state = [
+ 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
+ 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
+ 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
+ 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
+ ];
+
+ let message = [
+ 0x0000000000636261_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ ];
+ let count = [3, 0];
+ let f = true;
+
+ b.iter(|| {
+
+ unsafe {
+ avx2::compress(
+ black_box(&mut state),
+ black_box(message),
+ black_box(count),
+ black_box(f),
+ black_box(**rounds as usize),
+ );
+ }
+ });
+ },
+ );
+ }
+
+ group.finish();
+}
+
+
+pub fn portable_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("portable_impl");
+
+ for rounds in [12, 50, 100].iter() {
+ group.throughput(Throughput::Elements(*rounds as u64));
+ group.bench_with_input(
+ BenchmarkId::new("rounds", rounds),
+ &rounds,
+ |b, rounds| {
+ let mut state = [
+ 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
+ 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
+ 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
+ 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
+ ];
+
+ let message = [
+ 0x0000000000636261_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ 0x0000000000000000_u64, 0x0000000000000000_u64,
+ ];
+ let count = [3, 0];
+ let f = true;
+
+ b.iter(|| {
+ portable::compress(
+ black_box(&mut state),
+ black_box(message),
+ black_box(count),
+ black_box(f),
+ black_box(**rounds as usize),
+ );
+ });
+ },
+ );
+ }
+
+ group.finish();
+}
+
+criterion_group!(benches, avx_benchmark, avx_ifunc_benchmark, portable_benchmark);
+criterion_main!(benches);
\ No newline at end of file
diff --git a/util/EIP-152/src/avx2.rs b/util/EIP-152/src/avx2.rs
new file mode 100644
index 000000000..a619698bd
--- /dev/null
+++ b/util/EIP-152/src/avx2.rs
@@ -0,0 +1,471 @@
+// Copyright 2015-2019 Parity Technologies (UK) Ltd.
+// This file is part of Parity Ethereum.
+
+// Parity Ethereum is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Parity Ethereum is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
+
+//! AVX2 implementation of the blake2b compression function.
+use crate::IV;
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+use arrayref::{array_refs, mut_array_refs};
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+macro_rules! _MM_SHUFFLE {
+ ($z:expr, $y:expr, $x:expr, $w:expr) => {
+ ($z << 6) | ($y << 4) | ($x << 2) | $w
+ };
+}
+
+/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
+/// Takes as an argument the state vector `state`, message block vector `message`, offset counter, final
+/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
+/// parameter is modified by the function.
+///
+/// `g1` only operates on `x` from the original g function.
+/// ```
+/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) {
+/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
+/// v[d] = (v[d] ^ v[a]).rotate_right(32);
+/// v[c] = v[c].wrapping_add(v[d]);
+/// v[b] = (v[b] ^ v[c]).rotate_right(24);
+/// }
+/// ```
+///
+/// `g2` only operates on `y` from the original g function.
+/// ```
+/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) {
+/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
+/// v[d] = (v[d] ^ v[a]).rotate_right(16);
+/// v[c] = v[c].wrapping_add(v[d]);
+/// v[b] = (v[b] ^ v[c]).rotate_right(63);
+/// }
+/// ```
+///
+/// Message mixing is done based on sigma values, for a given round.
+///
+/// # Example
+///
+/// `SIGMA` for round 1 i.e `SIGMA[0]` = `[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`;
+/// ```
+/// let s = &SIGMA[0 % 10];
+/// // a, b, c, d, x
+/// g(&mut v, 0, 4, 8 , 12, m[s[0]]);
+/// g(&mut v, 1, 5, 9 , 13, m[s[2]]);
+/// g(&mut v, 2, 6, 10, 14, m[s[4]]);
+/// g(&mut v, 3, 7, 11, 15, m[s[6]]);
+///
+/// let a = v[..4];
+/// let b = v[4..8];
+/// let c = v[8..12];
+/// let d = v[12..16];
+/// let mut b0 = [m[0], m[2], m[4], m[6]];
+///
+/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+/// // ... then construct b0 for `g2` etc.
+/// ```
+///
+#[target_feature(enable = "avx2")]
+pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
+ // get a mutable reference to state[0..4], state[4..]
+ let (state_low, state_high) = mut_array_refs!(state, 4, 4);
+ // get a reference to IV[0..4], IV[4..]
+ let (iv_low, iv_high) = array_refs!(&IV, 4, 4);
+
+ // loads them into an __m256i
+ let mut a = loadu(state_low);
+ let mut b = loadu(state_high);
+ let mut c = loadu(iv_low);
+
+ // !a = xor(a, xor(a, !a))
+ let inverse = if f {
+ iv_high[3] ^ !iv_high[3]
+ } else {
+ 0
+ };
+
+ let flags = set4(
+ count[0],
+ count[1],
+ inverse,
+ 0,
+ );
+
+ let mut d = xor(loadu(iv_high), flags);
+
+ // get a reference to message[(0..2)+,]
+ let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2);
+ // load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
+
+ // m0 = __m256i([message[0], message[1], message[0], message[1]])
+ let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
+ // m1 = __m256i([message[2], message[3], message[2], message[3]])
+ let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
+ // m2 = __m256i([message[4], message[5], message[4], message[5]])
+ let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
+ // m3 = __m256i([message[6], message[7], message[6], message[7]])
+ let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
+ // m4 = __m256i([message[8], message[9], message[8], message[9]])
+ let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
+ // m5 = __m256i([message[10], message[11], message[10], message[11]])
+ let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
+ // m6 = __m256i([message[12], message[13], message[12], message[13]])
+ let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
+ // m7 = __m256i([message[14], message[15], message[14], message[15]])
+ let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
+
+ let iv0 = a;
+ let iv1 = b;
+
+ let mut t0;
+ let mut t1;
+ let mut b0;
+
+ for i in 0..rounds {
+ match i % 10 {
+ 0 => {
+ t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2]
+ t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6]
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6]
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3]
+ t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7]
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7]
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8]
+ t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12]
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12]
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9]
+ t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13]
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13]
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 1 => {
+ t0 = _mm256_unpacklo_epi64(m7, m2);
+ t1 = _mm256_unpackhi_epi64(m4, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_alignr_epi8(m3, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m2, m0);
+ t1 = _mm256_blend_epi32(m5, m0, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m6, m1, 8);
+ t1 = _mm256_blend_epi32(m3, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 2 => {
+ // round 3
+ t0 = _mm256_alignr_epi8(m6, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m2, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m6, m1, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m5, m4, 8);
+ t1 = _mm256_unpackhi_epi64(m1, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m2, m7);
+ t1 = _mm256_blend_epi32(m0, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 3 => {
+ // round 4
+ t0 = _mm256_unpackhi_epi64(m3, m1);
+ t1 = _mm256_unpackhi_epi64(m6, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m1, m7, 8);
+ t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m4, m3);
+ t1 = _mm256_unpacklo_epi64(m5, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 4 => {
+ // round 5
+ t0 = _mm256_unpackhi_epi64(m4, m2);
+ t1 = _mm256_unpacklo_epi64(m1, m5);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m3, m0, 0x33);
+ t1 = _mm256_blend_epi32(m7, m2, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m7, m1, 8);
+ t1 = _mm256_alignr_epi8(m3, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m6, m0);
+ t1 = _mm256_unpacklo_epi64(m6, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 5 => {
+ // round 6
+ t0 = _mm256_unpacklo_epi64(m1, m3);
+ t1 = _mm256_unpacklo_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m6, m5);
+ t1 = _mm256_unpackhi_epi64(m5, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_alignr_epi8(m2, m0, 8);
+ t1 = _mm256_unpackhi_epi64(m3, m7);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m4, m6);
+ t1 = _mm256_alignr_epi8(m7, m2, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 6 => {
+ // round 7
+ t0 = _mm256_blend_epi32(m0, m6, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m2);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m2, m7);
+ t1 = _mm256_alignr_epi8(m5, m6, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m4, m0);
+ t1 = _mm256_blend_epi32(m4, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m5, m3);
+ t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 7 => {
+ // round 8
+ t0 = _mm256_unpackhi_epi64(m6, m3);
+ t1 = _mm256_blend_epi32(m1, m6, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m7, m5, 8);
+ t1 = _mm256_unpackhi_epi64(m0, m4);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_blend_epi32(m2, m1, 0x33);
+ t1 = _mm256_alignr_epi8(m4, m7, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m5, m0);
+ t1 = _mm256_unpacklo_epi64(m2, m3);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ 8 => {
+ // round 9
+ t0 = _mm256_unpacklo_epi64(m3, m7);
+ t1 = _mm256_alignr_epi8(m0, m5, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpackhi_epi64(m7, m4);
+ t1 = _mm256_alignr_epi8(m4, m1, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpacklo_epi64(m5, m6);
+ t1 = _mm256_unpackhi_epi64(m6, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_alignr_epi8(m1, m2, 8);
+ t1 = _mm256_alignr_epi8(m2, m3, 8);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ _ => {
+ // round 10
+ t0 = _mm256_unpacklo_epi64(m5, m4);
+ t1 = _mm256_unpackhi_epi64(m3, m0);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_unpacklo_epi64(m1, m2);
+ t1 = _mm256_blend_epi32(m2, m3, 0x33);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ diagonalize(&mut a, &mut b, &mut c, &mut d);
+ t0 = _mm256_unpackhi_epi64(m6, m7);
+ t1 = _mm256_unpackhi_epi64(m4, m1);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ t0 = _mm256_blend_epi32(m5, m0, 0x33);
+ t1 = _mm256_unpacklo_epi64(m7, m6);
+ b0 = _mm256_blend_epi32(t0, t1, 0xF0);
+ g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
+ undiagonalize(&mut a, &mut b, &mut c, &mut d);
+ }
+ }
+ }
+
+ a = xor(a, c);
+ b = xor(b, d);
+ a = xor(a, iv0);
+ b = xor(b, iv1);
+
+ storeu(a, state_low);
+ storeu(b, state_high);
+}
+
+
+#[inline(always)]
+unsafe fn loadu(src: *const [u64; 4]) -> __m256i {
+ // This is an unaligned load, so the pointer cast is allowed.
+ _mm256_loadu_si256(src as *const __m256i)
+}
+
+#[inline(always)]
+unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) {
+ // This is an unaligned store, so the pointer cast is allowed.
+ _mm256_storeu_si256(dest as *mut __m256i, src)
+}
+
+#[inline(always)]
+unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i {
+ _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
+}
+
+#[inline(always)]
+unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_add_epi64(a, b)
+}
+
+#[inline(always)]
+unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
+ _mm256_xor_si256(a, b)
+}
+
+#[inline(always)]
+unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
+ _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
+}
+
+#[inline(always)]
+unsafe fn rotate_right_32(x: __m256i) -> __m256i {
+ _mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
+}
+
+#[inline(always)]
+unsafe fn rotate_right_24(x: __m256i) -> __m256i {
+ let rotate24 = _mm256_setr_epi8(
+ 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
+ 14, 15, 8, 9, 10,
+ );
+ _mm256_shuffle_epi8(x, rotate24)
+}
+
+#[inline(always)]
+unsafe fn rotate_right_16(x: __m256i) -> __m256i {
+ let rotate16 = _mm256_setr_epi8(
+ 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
+ 13, 14, 15, 8, 9,
+ );
+ _mm256_shuffle_epi8(x, rotate16)
+}
+
+#[inline(always)]
+unsafe fn rotate_right_63(x: __m256i) -> __m256i {
+ _mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
+}
+
+#[inline(always)]
+unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rotate_right_32(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rotate_right_24(*b);
+}
+
+#[inline(always)]
+unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
+ *a = add(*a, *m);
+ *a = add(*a, *b);
+ *d = xor(*d, *a);
+ *d = rotate_right_16(*d);
+ *c = add(*c, *d);
+ *b = xor(*b, *c);
+ *b = rotate_right_63(*b);
+}
+
+// Note the optimization here of leaving b as the unrotated row, rather than a.
+// All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+#[inline(always)]
+unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
+}
+
+// Note the optimization here of leaving b as the unrotated row, rather than a.
+// All the message loads below are adjusted to compensate for this. See
+// discussion at https://github.com/sneves/blake2-avx2/pull/4
+#[inline(always)]
+unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
+ *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
+ *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
+ *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
+}
+
+
+#[cfg(test)]
+mod tests {
+ #[test]
+ fn test_mm_shuffle() {
+ assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11);
+ assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00);
+ assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01);
+ }
+}
diff --git a/util/EIP-152/src/lib.rs b/util/EIP-152/src/lib.rs
index fd68b9072..1e33842cd 100644
--- a/util/EIP-152/src/lib.rs
+++ b/util/EIP-152/src/lib.rs
@@ -14,20 +14,24 @@
// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
+pub mod portable;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub mod avx2;
+
/// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7)
/// There are 10 16-byte arrays - one for each round
/// the entries are calculated from the sigma constants.
const SIGMA: [[usize; 16]; 10] = [
- [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
- [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
- [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
- [ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
- [ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
- [ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
- [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
- [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
- [ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
- [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+ [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
+ [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
+ [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
+ [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
+ [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
+ [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
+ [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
+ [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
+ [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
];
@@ -38,58 +42,30 @@ const IV: [u64; 8] = [
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
];
-
-#[inline(always)]
-/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
-fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
- v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
- v[d] = (v[d] ^ v[a]).rotate_right(32);
- v[c] = v[c].wrapping_add(v[d]);
- v[b] = (v[b] ^ v[c]).rotate_right(24);
- v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
- v[d] = (v[d] ^ v[a]).rotate_right(16);
- v[c] = v[c].wrapping_add(v[d]);
- v[b] = (v[b] ^ v[c]).rotate_right(63);
-}
-
-/// The Blake2 compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
-/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
-/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
-/// parameter is modified by the function.
-pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
- let mut v = [0u64; 16];
- v[..h.len()].copy_from_slice(h); // First half from state.
- v[h.len()..].copy_from_slice(&IV); // Second half from IV.
-
- v[12] ^= t[0];
- v[13] ^= t[1];
-
- if f {
- v[14] = !v[14] // Invert all bits if the last-block-flag is set.
- }
- for i in 0..rounds {
- // Message word selection permutation for this round.
- let s = &SIGMA[i % 10];
- g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
- g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
- g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
- g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
-
- g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
- g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
- g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
- g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
+/// blake2b compression function
+pub fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx2") {
+ unsafe {
+ return avx2::compress(state, message, count, f, rounds)
+ }
+ } else {
+ return portable::compress(state, message, count, f, rounds)
+ };
}
- for i in 0..8 {
- h[i] ^= v[i] ^ v[i + 8];
- }
+ #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+ portable::compress(state, message, count, f, rounds);
}
#[cfg(test)]
mod tests {
- use crate::compress;
+ use crate::portable;
+
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ use crate::avx2;
use rustc_hex::FromHex;
#[test]
@@ -119,9 +95,27 @@ mod tests {
0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64,
];
- compress(&mut h_in, m, c, f, rounds);
-
+ // portable
+ portable::compress(&mut h_in, m, c, f, rounds);
assert_eq!(h_in, h_out);
+
+ let mut h_in = [
+ 0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
+ 0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
+ 0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
+ 0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
+ ];
+
+ // avx
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ if is_x86_feature_detected!("avx2") {
+ unsafe {
+ avx2::compress(&mut h_in, m, c, f, rounds);
+ assert_eq!(h_in, h_out);
+ }
+ }
+ }
}
fn to_u64_slice(vec: &[u8], slice: &mut [u64]) {
@@ -130,6 +124,7 @@ mod tests {
})
}
+
#[test]
fn test_vectors_from_eip() {
let vec = vec![
@@ -178,15 +173,27 @@ mod tests {
to_u64_slice(&bytes[4..68], &mut h);
to_u64_slice(&bytes[68..196], &mut m);
to_u64_slice(&bytes[196..212], &mut t);
-
- compress(&mut h, m, t, f, rounds as usize);
-
+ let output: Vec<u8> = output.from_hex().unwrap();
-
let mut out = [0u64; 8];
to_u64_slice(&output[..], &mut out);
- assert_eq!(out, h);
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+ {
+ // avx
+ if is_x86_feature_detected!("avx2") {
+ unsafe {
+ avx2::compress(&mut h, m, t, f, rounds as usize);
+ assert_eq!(out, h);
+ }
+ }
+ }
+
+ {
+ // portable
+ to_u64_slice(&bytes[4..68], &mut h);
+ portable::compress(&mut h, m, t, f, rounds as usize);
+ assert_eq!(out, h);
+ }
}
}
}
diff --git a/util/EIP-152/src/portable.rs b/util/EIP-152/src/portable.rs
new file mode 100644
index 000000000..f0b8ac9d7
--- /dev/null
+++ b/util/EIP-152/src/portable.rs
@@ -0,0 +1,67 @@
+// Copyright 2015-2019 Parity Technologies (UK) Ltd.
+// This file is part of Parity Ethereum.
+
+// Parity Ethereum is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Parity Ethereum is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
+//! Portable implementation of the blake2b compress function
+
+use crate::{IV, SIGMA};
+
+/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
+#[inline(always)]
+fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
+ v[d] = (v[d] ^ v[a]).rotate_right(32);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(24);
+
+ v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
+ v[d] = (v[d] ^ v[a]).rotate_right(16);
+ v[c] = v[c].wrapping_add(v[d]);
+ v[b] = (v[b] ^ v[c]).rotate_right(63);
+}
+
+/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
+/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
+/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
+/// parameter is modified by the function.
+pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
+ let mut v = [0u64; 16];
+ v[..8].copy_from_slice(h); // First half from state.
+ v[8..].copy_from_slice(&IV); // Second half from IV.
+
+ v[12] ^= t[0];
+ v[13] ^= t[1];
+
+ if f {
+ v[14] = !v[14]; // Invert all bits if the last-block-flag is set.
+ }
+
+ for i in 0..rounds {
+ // Message word selection permutation for this round.
+ let s = &SIGMA[i % 10];
+ g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
+ g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
+ g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
+ g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
+
+ g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
+ g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
+ g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
+ g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
+ }
+
+ for i in 0..8 {
+ h[i] ^= v[i] ^ v[i + 8];
+ }
+}