SIMD Implementation for EIP-152 (#11056)
* simd implementation
* adds benchmarks
* Update util/EIP-152/src/avx.rs
Co-Authored-By: David <dvdplm@gmail.com>
* enable avx 😅
* better benchmark, docs
* rename avx to avx2, compile on android
* fix android again
* remove ifunc, code formatting
* license
* nits
* docs, nits
* fix test
This commit is contained in:
parent
ad633de6d9
commit
f24bff5998
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -810,6 +810,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
name = "eip-152"
|
name = "eip-152"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"arrayref 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"rustc-hex 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -12,3 +12,11 @@ edition = "2018"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rustc-hex = "2.0.1"
|
rustc-hex = "2.0.1"
|
||||||
|
arrayref = "0.3.5"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
criterion = "0.3"
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "bench"
|
||||||
|
harness = false
|
||||||
|
25
util/EIP-152/LICENSE
Normal file
25
util/EIP-152/LICENSE
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
This program is copyright 2019 Parity Technologies Limited and its licensors.
|
||||||
|
|
||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
Version 3, 29 June 2007
|
||||||
|
|
||||||
|
Some portions of the program (“the Software”) are Copyright (c) 2018 Jack O'Connor
|
||||||
|
and the following relates solely to such portions:
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
191
util/EIP-152/benches/bench.rs
Normal file
191
util/EIP-152/benches/bench.rs
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
|
||||||
|
// This file is part of Parity Ethereum.
|
||||||
|
|
||||||
|
// Parity Ethereum is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
|
||||||
|
// Parity Ethereum is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
use criterion::{Criterion, criterion_group, criterion_main, black_box, Throughput, BenchmarkId};
|
||||||
|
use std::mem;
|
||||||
|
use std::sync::atomic::{AtomicPtr, Ordering};
|
||||||
|
use eip_152::portable;
|
||||||
|
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
use eip_152::avx2;
|
||||||
|
|
||||||
|
type FnRaw = *mut ();
|
||||||
|
type Blake2bF = fn(&mut [u64; 8], [u64; 16], [u64; 2], bool, usize);
|
||||||
|
|
||||||
|
static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw);
|
||||||
|
|
||||||
|
fn detect(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
|
||||||
|
let fun = if is_x86_feature_detected!("avx2") {
|
||||||
|
avx2::compress as FnRaw
|
||||||
|
} else {
|
||||||
|
portable::compress as FnRaw
|
||||||
|
};
|
||||||
|
FN.store(fun as FnRaw, Ordering::Relaxed);
|
||||||
|
unsafe {
|
||||||
|
mem::transmute::<FnRaw, Blake2bF>(fun)(state, message, count, f, rounds)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
pub fn avx_ifunc_benchmark(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("avx2_ifunc");
|
||||||
|
|
||||||
|
for rounds in [12, 50, 100].iter() {
|
||||||
|
group.throughput(Throughput::Elements(*rounds as u64));
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("rounds", rounds),
|
||||||
|
&rounds,
|
||||||
|
|b, rounds| {
|
||||||
|
let mut state = [
|
||||||
|
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
|
||||||
|
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
|
||||||
|
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
|
||||||
|
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
|
||||||
|
];
|
||||||
|
|
||||||
|
let message = [
|
||||||
|
0x0000000000636261_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
];
|
||||||
|
let count = [3, 0];
|
||||||
|
let f = true;
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
unsafe {
|
||||||
|
let fun = FN.load(Ordering::Relaxed);
|
||||||
|
mem::transmute::<FnRaw, Blake2bF>
|
||||||
|
(fun)
|
||||||
|
(
|
||||||
|
black_box(&mut state),
|
||||||
|
black_box(message),
|
||||||
|
black_box(count),
|
||||||
|
black_box(f),
|
||||||
|
black_box(**rounds as usize),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
pub fn avx_benchmark(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("avx2");
|
||||||
|
|
||||||
|
for rounds in [12, 50, 100].iter() {
|
||||||
|
group.throughput(Throughput::Elements(*rounds as u64));
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("rounds", rounds),
|
||||||
|
&rounds,
|
||||||
|
|b, rounds| {
|
||||||
|
let mut state = [
|
||||||
|
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
|
||||||
|
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
|
||||||
|
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
|
||||||
|
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
|
||||||
|
];
|
||||||
|
|
||||||
|
let message = [
|
||||||
|
0x0000000000636261_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
];
|
||||||
|
let count = [3, 0];
|
||||||
|
let f = true;
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
avx2::compress(
|
||||||
|
black_box(&mut state),
|
||||||
|
black_box(message),
|
||||||
|
black_box(count),
|
||||||
|
black_box(f),
|
||||||
|
black_box(**rounds as usize),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
pub fn portable_benchmark(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("portable_impl");
|
||||||
|
|
||||||
|
for rounds in [12, 50, 100].iter() {
|
||||||
|
group.throughput(Throughput::Elements(*rounds as u64));
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("rounds", rounds),
|
||||||
|
&rounds,
|
||||||
|
|b, rounds| {
|
||||||
|
let mut state = [
|
||||||
|
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
|
||||||
|
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
|
||||||
|
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
|
||||||
|
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
|
||||||
|
];
|
||||||
|
|
||||||
|
let message = [
|
||||||
|
0x0000000000636261_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
0x0000000000000000_u64, 0x0000000000000000_u64,
|
||||||
|
];
|
||||||
|
let count = [3, 0];
|
||||||
|
let f = true;
|
||||||
|
|
||||||
|
b.iter(|| {
|
||||||
|
portable::compress(
|
||||||
|
black_box(&mut state),
|
||||||
|
black_box(message),
|
||||||
|
black_box(count),
|
||||||
|
black_box(f),
|
||||||
|
black_box(**rounds as usize),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, avx_benchmark, avx_ifunc_benchmark, portable_benchmark);
|
||||||
|
criterion_main!(benches);
|
471
util/EIP-152/src/avx2.rs
Normal file
471
util/EIP-152/src/avx2.rs
Normal file
@ -0,0 +1,471 @@
|
|||||||
|
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
|
||||||
|
// This file is part of Parity Ethereum.
|
||||||
|
|
||||||
|
// Parity Ethereum is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
|
||||||
|
// Parity Ethereum is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
//! AVX2 implementation of the blake2b compression function.
|
||||||
|
use crate::IV;
|
||||||
|
|
||||||
|
#[cfg(target_arch = "x86")]
|
||||||
|
use core::arch::x86::*;
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
|
use core::arch::x86_64::*;
|
||||||
|
use arrayref::{array_refs, mut_array_refs};
|
||||||
|
|
||||||
|
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
|
||||||
|
macro_rules! _MM_SHUFFLE {
|
||||||
|
($z:expr, $y:expr, $x:expr, $w:expr) => {
|
||||||
|
($z << 6) | ($y << 4) | ($x << 2) | $w
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
|
||||||
|
/// Takes as an argument the state vector `state`, message block vector `message`, offset counter, final
|
||||||
|
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
|
||||||
|
/// parameter is modified by the function.
|
||||||
|
///
|
||||||
|
/// `g1` only operates on `x` from the original g function.
|
||||||
|
/// ```
|
||||||
|
/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) {
|
||||||
|
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
|
||||||
|
/// v[d] = (v[d] ^ v[a]).rotate_right(32);
|
||||||
|
/// v[c] = v[c].wrapping_add(v[d]);
|
||||||
|
/// v[b] = (v[b] ^ v[c]).rotate_right(24);
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// `g2` only operates on `y` from the originial g function.
|
||||||
|
/// ```
|
||||||
|
/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) {
|
||||||
|
/// v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
|
||||||
|
/// v[d] = (v[d] ^ v[a]).rotate_right(16);
|
||||||
|
/// v[c] = v[c].wrapping_add(v[d]);
|
||||||
|
/// v[b] = (v[b] ^ v[c]).rotate_right(63);
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Message mixing is done based on sigma values, for a given round.
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// `SIGMA` for round 1 i.e `SIGMA[0]` = `[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`;
|
||||||
|
/// ```
|
||||||
|
/// let s = &SIGMA[0 % 10];
|
||||||
|
/// // a, b, c, d, x
|
||||||
|
/// g(&mut v, 0, 4, 8 , 12, m[s[0]]);
|
||||||
|
/// g(&mut v, 1, 5, 9 , 13, m[s[2]]);
|
||||||
|
/// g(&mut v, 2, 6, 10, 14, m[s[4]]);
|
||||||
|
/// g(&mut v, 3, 7, 11, 15, m[s[6]]);
|
||||||
|
///
|
||||||
|
/// let a = v[..4];
|
||||||
|
/// let b = v[4..8];
|
||||||
|
/// let c = v[8..12];
|
||||||
|
/// let d = v[12..16];
|
||||||
|
/// let mut b0 = [m[0], m[2], m[4], m[6]];
|
||||||
|
///
|
||||||
|
/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
/// // ... then contruct b0 for `g2` etc.
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
#[target_feature(enable = "avx2")]
|
||||||
|
pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
|
||||||
|
// get a mutable reference to state[0..4], state[4..]
|
||||||
|
let (state_low, state_high) = mut_array_refs!(state, 4, 4);
|
||||||
|
// get a reference to IV[0..4], IV[4..]
|
||||||
|
let (iv_low, iv_high) = array_refs!(&IV, 4, 4);
|
||||||
|
|
||||||
|
// loads them into an __m256i
|
||||||
|
let mut a = loadu(state_low);
|
||||||
|
let mut b = loadu(state_high);
|
||||||
|
let mut c = loadu(iv_low);
|
||||||
|
|
||||||
|
// !a = xor(a, xor(a, !a))
|
||||||
|
let inverse = if f {
|
||||||
|
iv_high[3] ^ !iv_high[3]
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
|
||||||
|
let flags = set4(
|
||||||
|
count[0],
|
||||||
|
count[1],
|
||||||
|
inverse,
|
||||||
|
0,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut d = xor(loadu(iv_high), flags);
|
||||||
|
|
||||||
|
// get a reference to message[(0..2)+,]
|
||||||
|
let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2);
|
||||||
|
// load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
|
||||||
|
|
||||||
|
// m0 = __m256i([message[0], message[1], message[0], message[1]])
|
||||||
|
let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
|
||||||
|
// m1 = __m256i([message[2], message[3], message[2], message[3]])
|
||||||
|
let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
|
||||||
|
// m2 = __m256i([message[4], message[5], message[4], message[5]])
|
||||||
|
let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
|
||||||
|
// m3 = __m256i([message[6], message[7], message[6], message[7]])
|
||||||
|
let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
|
||||||
|
// m4 = __m256i([message[8], message[9], message[8], message[9]])
|
||||||
|
let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
|
||||||
|
// m5 = __m256i([message[10], message[11], message[10], message[11]])
|
||||||
|
let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
|
||||||
|
// m6 = __m256i([message[12], message[13], message[12], message[13]])
|
||||||
|
let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
|
||||||
|
// m7 = __m256i([message[14], message[15], message[14], message[15]])
|
||||||
|
let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));
|
||||||
|
|
||||||
|
let iv0 = a;
|
||||||
|
let iv1 = b;
|
||||||
|
|
||||||
|
let mut t0;
|
||||||
|
let mut t1;
|
||||||
|
let mut b0;
|
||||||
|
|
||||||
|
for i in 0..rounds {
|
||||||
|
match i % 10 {
|
||||||
|
0 => {
|
||||||
|
t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2]
|
||||||
|
t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6]
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6]
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3]
|
||||||
|
t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7]
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7]
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8]
|
||||||
|
t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12]
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12]
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9]
|
||||||
|
t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13]
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13]
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
1 => {
|
||||||
|
t0 = _mm256_unpacklo_epi64(m7, m2);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m4, m6);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m5, m4);
|
||||||
|
t1 = _mm256_alignr_epi8(m3, m7, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m2, m0);
|
||||||
|
t1 = _mm256_blend_epi32(m5, m0, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_alignr_epi8(m6, m1, 8);
|
||||||
|
t1 = _mm256_blend_epi32(m3, m1, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
2 => {
|
||||||
|
// round 3
|
||||||
|
t0 = _mm256_alignr_epi8(m6, m5, 8);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m2, m7);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m4, m0);
|
||||||
|
t1 = _mm256_blend_epi32(m6, m1, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_alignr_epi8(m5, m4, 8);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m1, m3);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m2, m7);
|
||||||
|
t1 = _mm256_blend_epi32(m0, m3, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
3 => {
|
||||||
|
// round 4
|
||||||
|
t0 = _mm256_unpackhi_epi64(m3, m1);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m6, m5);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m4, m0);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m6, m7);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_alignr_epi8(m1, m7, 8);
|
||||||
|
t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m4, m3);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m5, m0);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
4 => {
|
||||||
|
// round 5
|
||||||
|
t0 = _mm256_unpackhi_epi64(m4, m2);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m1, m5);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_blend_epi32(m3, m0, 0x33);
|
||||||
|
t1 = _mm256_blend_epi32(m7, m2, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_alignr_epi8(m7, m1, 8);
|
||||||
|
t1 = _mm256_alignr_epi8(m3, m5, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m6, m0);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m6, m4);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
5 => {
|
||||||
|
// round 6
|
||||||
|
t0 = _mm256_unpacklo_epi64(m1, m3);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m0, m4);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m6, m5);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m5, m1);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_alignr_epi8(m2, m0, 8);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m3, m7);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m4, m6);
|
||||||
|
t1 = _mm256_alignr_epi8(m7, m2, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
6 => {
|
||||||
|
// round 7
|
||||||
|
t0 = _mm256_blend_epi32(m0, m6, 0x33);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m7, m2);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m2, m7);
|
||||||
|
t1 = _mm256_alignr_epi8(m5, m6, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m4, m0);
|
||||||
|
t1 = _mm256_blend_epi32(m4, m3, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m5, m3);
|
||||||
|
t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
7 => {
|
||||||
|
// round 8
|
||||||
|
t0 = _mm256_unpackhi_epi64(m6, m3);
|
||||||
|
t1 = _mm256_blend_epi32(m1, m6, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_alignr_epi8(m7, m5, 8);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m0, m4);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_blend_epi32(m2, m1, 0x33);
|
||||||
|
t1 = _mm256_alignr_epi8(m4, m7, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m5, m0);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m2, m3);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
8 => {
|
||||||
|
// round 9
|
||||||
|
t0 = _mm256_unpacklo_epi64(m3, m7);
|
||||||
|
t1 = _mm256_alignr_epi8(m0, m5, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m7, m4);
|
||||||
|
t1 = _mm256_alignr_epi8(m4, m1, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m5, m6);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m6, m0);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_alignr_epi8(m1, m2, 8);
|
||||||
|
t1 = _mm256_alignr_epi8(m2, m3, 8);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// round 10
|
||||||
|
t0 = _mm256_unpacklo_epi64(m5, m4);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m3, m0);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_unpacklo_epi64(m1, m2);
|
||||||
|
t1 = _mm256_blend_epi32(m2, m3, 0x33);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
diagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
t0 = _mm256_unpackhi_epi64(m6, m7);
|
||||||
|
t1 = _mm256_unpackhi_epi64(m4, m1);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
t0 = _mm256_blend_epi32(m5, m0, 0x33);
|
||||||
|
t1 = _mm256_unpacklo_epi64(m7, m6);
|
||||||
|
b0 = _mm256_blend_epi32(t0, t1, 0xF0);
|
||||||
|
g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
|
||||||
|
undiagonalize(&mut a, &mut b, &mut c, &mut d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
a = xor(a, c);
|
||||||
|
b = xor(b, d);
|
||||||
|
a = xor(a, iv0);
|
||||||
|
b = xor(b, iv1);
|
||||||
|
|
||||||
|
storeu(a, state_low);
|
||||||
|
storeu(b, state_high);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn loadu(src: *const [u64; 4]) -> __m256i {
|
||||||
|
// This is an unaligned load, so the pointer cast is allowed.
|
||||||
|
_mm256_loadu_si256(src as *const __m256i)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) {
|
||||||
|
// This is an unaligned store, so the pointer cast is allowed.
|
||||||
|
_mm256_storeu_si256(dest as *mut __m256i, src)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i {
|
||||||
|
_mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
|
||||||
|
_mm256_add_epi64(a, b)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
|
||||||
|
_mm256_xor_si256(a, b)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
|
||||||
|
_mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn rotate_right_32(x: __m256i) -> __m256i {
|
||||||
|
_mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn rotate_right_24(x: __m256i) -> __m256i {
|
||||||
|
let rotate24 = _mm256_setr_epi8(
|
||||||
|
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13,
|
||||||
|
14, 15, 8, 9, 10,
|
||||||
|
);
|
||||||
|
_mm256_shuffle_epi8(x, rotate24)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn rotate_right_16(x: __m256i) -> __m256i {
|
||||||
|
let rotate16 = _mm256_setr_epi8(
|
||||||
|
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12,
|
||||||
|
13, 14, 15, 8, 9,
|
||||||
|
);
|
||||||
|
_mm256_shuffle_epi8(x, rotate16)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn rotate_right_63(x: __m256i) -> __m256i {
|
||||||
|
_mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
|
||||||
|
*a = add(*a, *m);
|
||||||
|
*a = add(*a, *b);
|
||||||
|
*d = xor(*d, *a);
|
||||||
|
*d = rotate_right_32(*d);
|
||||||
|
*c = add(*c, *d);
|
||||||
|
*b = xor(*b, *c);
|
||||||
|
*b = rotate_right_24(*b);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
|
||||||
|
*a = add(*a, *m);
|
||||||
|
*a = add(*a, *b);
|
||||||
|
*d = xor(*d, *a);
|
||||||
|
*d = rotate_right_16(*d);
|
||||||
|
*c = add(*c, *d);
|
||||||
|
*b = xor(*b, *c);
|
||||||
|
*b = rotate_right_63(*b);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note the optimization here of leaving b as the unrotated row, rather than a.
|
||||||
|
// All the message loads below are adjusted to compensate for this. See
|
||||||
|
// discussion at https://github.com/sneves/blake2-avx2/pull/4
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
|
||||||
|
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
|
||||||
|
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
|
||||||
|
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note the optimization here of leaving b as the unrotated row, rather than a.
|
||||||
|
// All the message loads below are adjusted to compensate for this. See
|
||||||
|
// discussion at https://github.com/sneves/blake2-avx2/pull/4
|
||||||
|
#[inline(always)]
|
||||||
|
unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
|
||||||
|
*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
|
||||||
|
*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
|
||||||
|
*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
#[test]
|
||||||
|
fn test_mm_shuffle() {
|
||||||
|
assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11);
|
||||||
|
assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00);
|
||||||
|
assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01);
|
||||||
|
}
|
||||||
|
}
|
@ -14,20 +14,24 @@
|
|||||||
// You should have received a copy of the GNU General Public License
|
// You should have received a copy of the GNU General Public License
|
||||||
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
|
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
pub mod portable;
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
pub mod avx2;
|
||||||
|
|
||||||
/// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7)
|
/// The precomputed values for BLAKE2b [from the spec](https://tools.ietf.org/html/rfc7693#section-2.7)
|
||||||
/// There are 10 16-byte arrays - one for each round
|
/// There are 10 16-byte arrays - one for each round
|
||||||
/// the entries are calculated from the sigma constants.
|
/// the entries are calculated from the sigma constants.
|
||||||
const SIGMA: [[usize; 16]; 10] = [
|
const SIGMA: [[usize; 16]; 10] = [
|
||||||
[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
|
||||||
[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
|
[14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
|
||||||
[11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
|
[11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
|
||||||
[ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
|
[7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
|
||||||
[ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
|
[9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
|
||||||
[ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
|
[2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
|
||||||
[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
|
[12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
|
||||||
[13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
|
[13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
|
||||||
[ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
|
[6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
|
||||||
[10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
|
[10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
||||||
@ -38,58 +42,30 @@ const IV: [u64; 8] = [
|
|||||||
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
|
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
|
||||||
];
|
];
|
||||||
|
|
||||||
|
/// blake2b compression function
|
||||||
#[inline(always)]
|
pub fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
|
||||||
/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
|
{
|
||||||
v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
|
if is_x86_feature_detected!("avx2") {
|
||||||
v[d] = (v[d] ^ v[a]).rotate_right(32);
|
unsafe {
|
||||||
v[c] = v[c].wrapping_add(v[d]);
|
return avx2::compress(state, message, count, f, rounds)
|
||||||
v[b] = (v[b] ^ v[c]).rotate_right(24);
|
}
|
||||||
v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
|
} else {
|
||||||
v[d] = (v[d] ^ v[a]).rotate_right(16);
|
return portable::compress(state, message, count, f, rounds)
|
||||||
v[c] = v[c].wrapping_add(v[d]);
|
};
|
||||||
v[b] = (v[b] ^ v[c]).rotate_right(63);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The Blake2 compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
|
|
||||||
/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
|
|
||||||
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
|
|
||||||
/// parameter is modified by the function.
|
|
||||||
pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
|
|
||||||
let mut v = [0u64; 16];
|
|
||||||
v[..h.len()].copy_from_slice(h); // First half from state.
|
|
||||||
v[h.len()..].copy_from_slice(&IV); // Second half from IV.
|
|
||||||
|
|
||||||
v[12] ^= t[0];
|
|
||||||
v[13] ^= t[1];
|
|
||||||
|
|
||||||
if f {
|
|
||||||
v[14] = !v[14] // Invert all bits if the last-block-flag is set.
|
|
||||||
}
|
|
||||||
for i in 0..rounds {
|
|
||||||
// Message word selection permutation for this round.
|
|
||||||
let s = &SIGMA[i % 10];
|
|
||||||
g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
|
|
||||||
g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
|
|
||||||
g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
|
|
||||||
g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
|
|
||||||
|
|
||||||
g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
|
|
||||||
g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
|
|
||||||
g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
|
|
||||||
g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for i in 0..8 {
|
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
|
||||||
h[i] ^= v[i] ^ v[i + 8];
|
portable::compress(state, message, count, f, rounds);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::compress;
|
use crate::portable;
|
||||||
|
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
use crate::avx2;
|
||||||
use rustc_hex::FromHex;
|
use rustc_hex::FromHex;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -119,9 +95,27 @@ mod tests {
|
|||||||
0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64,
|
0x5A92F1DBA88AD318_u64, 0x239900D4ED8623B9_u64,
|
||||||
];
|
];
|
||||||
|
|
||||||
compress(&mut h_in, m, c, f, rounds);
|
// portable
|
||||||
|
portable::compress(&mut h_in, m, c, f, rounds);
|
||||||
assert_eq!(h_in, h_out);
|
assert_eq!(h_in, h_out);
|
||||||
|
|
||||||
|
let mut h_in = [
|
||||||
|
0x6a09e667f2bdc948_u64, 0xbb67ae8584caa73b_u64,
|
||||||
|
0x3c6ef372fe94f82b_u64, 0xa54ff53a5f1d36f1_u64,
|
||||||
|
0x510e527fade682d1_u64, 0x9b05688c2b3e6c1f_u64,
|
||||||
|
0x1f83d9abfb41bd6b_u64, 0x5be0cd19137e2179_u64,
|
||||||
|
];
|
||||||
|
|
||||||
|
// avx
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
{
|
||||||
|
if is_x86_feature_detected!("avx2") {
|
||||||
|
unsafe {
|
||||||
|
avx2::compress(&mut h_in, m, c, f, rounds);
|
||||||
|
assert_eq!(h_in, h_out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn to_u64_slice(vec: &[u8], slice: &mut [u64]) {
|
fn to_u64_slice(vec: &[u8], slice: &mut [u64]) {
|
||||||
@ -130,6 +124,7 @@ mod tests {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_vectors_from_eip() {
|
fn test_vectors_from_eip() {
|
||||||
let vec = vec![
|
let vec = vec![
|
||||||
@ -178,15 +173,27 @@ mod tests {
|
|||||||
to_u64_slice(&bytes[4..68], &mut h);
|
to_u64_slice(&bytes[4..68], &mut h);
|
||||||
to_u64_slice(&bytes[68..196], &mut m);
|
to_u64_slice(&bytes[68..196], &mut m);
|
||||||
to_u64_slice(&bytes[196..212], &mut t);
|
to_u64_slice(&bytes[196..212], &mut t);
|
||||||
|
|
||||||
compress(&mut h, m, t, f, rounds as usize);
|
|
||||||
|
|
||||||
let output: Vec<u8> = output.from_hex().unwrap();
|
let output: Vec<u8> = output.from_hex().unwrap();
|
||||||
|
|
||||||
let mut out = [0u64; 8];
|
let mut out = [0u64; 8];
|
||||||
to_u64_slice(&output[..], &mut out);
|
to_u64_slice(&output[..], &mut out);
|
||||||
|
|
||||||
assert_eq!(out, h);
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
|
{
|
||||||
|
// avx
|
||||||
|
if is_x86_feature_detected!("avx2") {
|
||||||
|
unsafe {
|
||||||
|
avx2::compress(&mut h, m, t, f, rounds as usize);
|
||||||
|
assert_eq!(out, h);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// portable
|
||||||
|
to_u64_slice(&bytes[4..68], &mut h);
|
||||||
|
portable::compress(&mut h, m, t, f, rounds as usize);
|
||||||
|
assert_eq!(out, h);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
67
util/EIP-152/src/portable.rs
Normal file
67
util/EIP-152/src/portable.rs
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
// Copyright 2015-2019 Parity Technologies (UK) Ltd.
|
||||||
|
// This file is part of Parity Ethereum.
|
||||||
|
|
||||||
|
// Parity Ethereum is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
|
||||||
|
// Parity Ethereum is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
//! Portable implementation of the blake2b compress function
|
||||||
|
|
||||||
|
use crate::{IV, SIGMA};
|
||||||
|
|
||||||
|
/// The G mixing function. See https://tools.ietf.org/html/rfc7693#section-3.1
|
||||||
|
#[inline(always)]
|
||||||
|
fn g(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
|
||||||
|
v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
|
||||||
|
v[d] = (v[d] ^ v[a]).rotate_right(32);
|
||||||
|
v[c] = v[c].wrapping_add(v[d]);
|
||||||
|
v[b] = (v[b] ^ v[c]).rotate_right(24);
|
||||||
|
|
||||||
|
v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
|
||||||
|
v[d] = (v[d] ^ v[a]).rotate_right(16);
|
||||||
|
v[c] = v[c].wrapping_add(v[d]);
|
||||||
|
v[b] = (v[b] ^ v[c]).rotate_right(63);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
|
||||||
|
/// Takes as an argument the state vector `h`, message block vector `m`, offset counter `t`, final
|
||||||
|
/// block indicator flag `f`, and number of rounds `rounds`. The state vector provided as the first
|
||||||
|
/// parameter is modified by the function.
|
||||||
|
pub fn compress(h: &mut [u64; 8], m: [u64; 16], t: [u64; 2], f: bool, rounds: usize) {
|
||||||
|
let mut v = [0u64; 16];
|
||||||
|
v[..8].copy_from_slice(h); // First half from state.
|
||||||
|
v[8..].copy_from_slice(&IV); // Second half from IV.
|
||||||
|
|
||||||
|
v[12] ^= t[0];
|
||||||
|
v[13] ^= t[1];
|
||||||
|
|
||||||
|
if f {
|
||||||
|
v[14] = !v[14]; // Invert all bits if the last-block-flag is set.
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..rounds {
|
||||||
|
// Message word selection permutation for this round.
|
||||||
|
let s = &SIGMA[i % 10];
|
||||||
|
g(&mut v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
|
||||||
|
g(&mut v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
|
||||||
|
g(&mut v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
|
||||||
|
g(&mut v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
|
||||||
|
|
||||||
|
g(&mut v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
|
||||||
|
g(&mut v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
|
||||||
|
g(&mut v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
|
||||||
|
g(&mut v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..8 {
|
||||||
|
h[i] ^= v[i] ^ v[i + 8];
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user