// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.

// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Parity Ethereum. If not, see <http://www.gnu.org/licenses/>.

//! AVX2 implementation of the blake2b compression function.

use crate::IV;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use arrayref::{array_refs, mut_array_refs};

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
	($z:expr, $y:expr, $x:expr, $w:expr) => {
		($z << 6) | ($y << 4) | ($x << 2) | $w
	};
}

/// The Blake2b compression function F. See https://tools.ietf.org/html/rfc7693#section-3.2
/// Takes as arguments the state vector `state`, the message block vector `message`, the offset
/// counter `count`, the final block indicator flag `f`, and the number of rounds `rounds`. The
/// state vector provided as the first parameter is modified in place.
///
/// `g1` only operates on `x` from the original g function.
/// ```
/// fn portable_g1(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, x: u64) {
///     v[a] = v[a].wrapping_add(v[b]).wrapping_add(x);
///     v[d] = (v[d] ^ v[a]).rotate_right(32);
///     v[c] = v[c].wrapping_add(v[d]);
///     v[b] = (v[b] ^ v[c]).rotate_right(24);
/// }
/// ```
///
/// `g2` only operates on `y` from the original g function.
/// ```
/// fn portable_g2(v: &mut [u64], a: usize, b: usize, c: usize, d: usize, y: u64) {
///     v[a] = v[a].wrapping_add(v[b]).wrapping_add(y);
///     v[d] = (v[d] ^ v[a]).rotate_right(16);
///     v[c] = v[c].wrapping_add(v[d]);
///     v[b] = (v[b] ^ v[c]).rotate_right(63);
/// }
/// ```
///
/// Message mixing is done based on the `SIGMA` values for the given round.
///
/// # Example
///
/// `SIGMA` for round 1, i.e. `SIGMA[0]`, is `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]`.
/// The portable column step applies `g` once per column; the vectorized version instead splits the
/// state into four row vectors and mixes all four columns at once (illustrative, not compilable
/// as-is):
/// ```ignore
/// let s = &SIGMA[0 % 10];
/// // a, b, c, d, x
/// g(&mut v, 0, 4, 8,  12, m[s[0]]);
/// g(&mut v, 1, 5, 9,  13, m[s[2]]);
/// g(&mut v, 2, 6, 10, 14, m[s[4]]);
/// g(&mut v, 3, 7, 11, 15, m[s[6]]);
///
/// // Vectorized: one row vector per quarter of the state, and the four
/// // x-words gathered into a single vector b0.
/// let mut a = [v[0], v[1], v[2], v[3]];
/// let mut b = [v[4], v[5], v[6], v[7]];
/// let mut c = [v[8], v[9], v[10], v[11]];
/// let mut d = [v[12], v[13], v[14], v[15]];
/// let mut b0 = [m[0], m[2], m[4], m[6]];
///
/// g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
/// // ... then construct b0 for `g2`, and so on.
/// ```
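///
/// # Usage
///
/// A minimal sketch of how a caller might dispatch to this function. The runtime feature
/// check (and the choice of 12 rounds, Blake2b's standard round count) are illustrative
/// assumptions about the calling crate, not requirements of this module:
///
/// ```ignore
/// let mut state = [0u64; 8]; // h[0..8], derived from the IV and parameter block
/// let message = [0u64; 16];  // one 128-byte message block as 16 little-endian words
/// let count = [128u64, 0];   // low and high words of the byte offset counter
///
/// if is_x86_feature_detected!("avx2") {
///     // Sound only because AVX2 support was just verified.
///     unsafe { compress(&mut state, message, count, true, 12) };
/// }
/// ```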
#[target_feature(enable = "avx2")]
pub unsafe fn compress(state: &mut [u64; 8], message: [u64; 16], count: [u64; 2], f: bool, rounds: usize) {
	// get a mutable reference to state[0..4], state[4..]
	let (state_low, state_high) = mut_array_refs!(state, 4, 4);
	// get a reference to IV[0..4], IV[4..]
	let (iv_low, iv_high) = array_refs!(&IV, 4, 4);

	// load them into __m256i vectors
	let mut a = loadu(state_low);
	let mut b = loadu(state_high);
	let mut c = loadu(iv_low);

	// `iv_high[3] ^ !iv_high[3]` is all ones; xoring it into the third lane of `d`
	// below inverts v[14], which is how the final block indicator flag is applied.
	let inverse = if f { iv_high[3] ^ !iv_high[3] } else { 0 };
	let flags = set4(count[0], count[1], inverse, 0);
	let mut d = xor(loadu(iv_high), flags);

	// split the message into eight [u64; 2] chunks
	let msg_chunks = array_refs!(&message, 2, 2, 2, 2, 2, 2, 2, 2);

	// load each message [u64; 2] into an __m128i, broadcast it into both lanes of an __m256i.
	// m0 = __m256i([message[0], message[1], message[0], message[1]])
	let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
	// m1 = __m256i([message[2], message[3], message[2], message[3]])
	let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
	// m2 = __m256i([message[4], message[5], message[4], message[5]])
	let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
	// m3 = __m256i([message[6], message[7], message[6], message[7]])
	let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
	// m4 = __m256i([message[8], message[9], message[8], message[9]])
	let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
	// m5 = __m256i([message[10], message[11], message[10], message[11]])
	let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
	// m6 = __m256i([message[12], message[13], message[12], message[13]])
	let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
	// m7 = __m256i([message[14], message[15], message[14], message[15]])
	let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

	// save the input state for the feed-forward after the rounds
	let iv0 = a;
	let iv1 = b;
	let mut t0;
	let mut t1;
	let mut b0;
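	// Each arm of the match below hard-codes one row of the Blake2b SIGMA table
	// (https://tools.ietf.org/html/rfc7693#section-2.7): the unpack/blend/align
	// shuffles gather the round's message words into `b0` four at a time, in the
	// lane order expected by `g1`/`g2` (adjusted for the diagonalization
	// optimization described at `diagonalize` below). Rounds past the tenth reuse
	// the table cyclically, hence `i % 10`.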
	for i in 0..rounds {
		match i % 10 {
			0 => {
				// round 1
				t0 = _mm256_unpacklo_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [0, 2, 0, 2]
				t1 = _mm256_unpacklo_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [4, 6, 4, 6]
				b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([0, 2, 0, 2], [4, 6, 4, 6]) = [0, 2, 4, 6]
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m0, m1); // ([0, 1, 0, 1], [2, 3, 2, 3]) = [1, 3, 1, 3]
				t1 = _mm256_unpackhi_epi64(m2, m3); // ([4, 5, 4, 5], [6, 7, 6, 7]) = [5, 7, 5, 7]
				b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([1, 3, 1, 3], [5, 7, 5, 7]) = [1, 3, 5, 7]
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_unpacklo_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [14, 8, 14, 8]
				t1 = _mm256_unpacklo_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [10, 12, 10, 12]
				b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([14, 8, 14, 8], [10, 12, 10, 12]) = [14, 8, 10, 12]
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m7, m4); // ([14, 15, 14, 15], [8, 9, 8, 9]) = [15, 9, 15, 9]
				t1 = _mm256_unpackhi_epi64(m5, m6); // ([10, 11, 10, 11], [12, 13, 12, 13]) = [11, 13, 11, 13]
				b0 = _mm256_blend_epi32(t0, t1, 0xF0); // ([15, 9, 15, 9], [11, 13, 11, 13]) = [15, 9, 11, 13]
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			1 => {
				// round 2
				t0 = _mm256_unpacklo_epi64(m7, m2);
				t1 = _mm256_unpackhi_epi64(m4, m6);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m5, m4);
				t1 = _mm256_alignr_epi8(m3, m7, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_unpackhi_epi64(m2, m0);
				t1 = _mm256_blend_epi32(m5, m0, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_alignr_epi8(m6, m1, 8);
				t1 = _mm256_blend_epi32(m3, m1, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			2 => {
				// round 3
				t0 = _mm256_alignr_epi8(m6, m5, 8);
				t1 = _mm256_unpackhi_epi64(m2, m7);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m4, m0);
				t1 = _mm256_blend_epi32(m6, m1, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_alignr_epi8(m5, m4, 8);
				t1 = _mm256_unpackhi_epi64(m1, m3);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m2, m7);
				t1 = _mm256_blend_epi32(m0, m3, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			3 => {
				// round 4
				t0 = _mm256_unpackhi_epi64(m3, m1);
				t1 = _mm256_unpackhi_epi64(m6, m5);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m4, m0);
				t1 = _mm256_unpacklo_epi64(m6, m7);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_alignr_epi8(m1, m7, 8);
				t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m4, m3);
				t1 = _mm256_unpacklo_epi64(m5, m0);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			4 => {
				// round 5
				t0 = _mm256_unpackhi_epi64(m4, m2);
				t1 = _mm256_unpacklo_epi64(m1, m5);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_blend_epi32(m3, m0, 0x33);
				t1 = _mm256_blend_epi32(m7, m2, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_alignr_epi8(m7, m1, 8);
				t1 = _mm256_alignr_epi8(m3, m5, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m6, m0);
				t1 = _mm256_unpacklo_epi64(m6, m4);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			5 => {
				// round 6
				t0 = _mm256_unpacklo_epi64(m1, m3);
				t1 = _mm256_unpacklo_epi64(m0, m4);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m6, m5);
				t1 = _mm256_unpackhi_epi64(m5, m1);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_alignr_epi8(m2, m0, 8);
				t1 = _mm256_unpackhi_epi64(m3, m7);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m4, m6);
				t1 = _mm256_alignr_epi8(m7, m2, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			6 => {
				// round 7
				t0 = _mm256_blend_epi32(m0, m6, 0x33);
				t1 = _mm256_unpacklo_epi64(m7, m2);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m2, m7);
				t1 = _mm256_alignr_epi8(m5, m6, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_unpacklo_epi64(m4, m0);
				t1 = _mm256_blend_epi32(m4, m3, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m5, m3);
				t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			7 => {
				// round 8
				t0 = _mm256_unpackhi_epi64(m6, m3);
				t1 = _mm256_blend_epi32(m1, m6, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_alignr_epi8(m7, m5, 8);
				t1 = _mm256_unpackhi_epi64(m0, m4);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_blend_epi32(m2, m1, 0x33);
				t1 = _mm256_alignr_epi8(m4, m7, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m5, m0);
				t1 = _mm256_unpacklo_epi64(m2, m3);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			8 => {
				// round 9
				t0 = _mm256_unpacklo_epi64(m3, m7);
				t1 = _mm256_alignr_epi8(m0, m5, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpackhi_epi64(m7, m4);
				t1 = _mm256_alignr_epi8(m4, m1, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_unpacklo_epi64(m5, m6);
				t1 = _mm256_unpackhi_epi64(m6, m0);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_alignr_epi8(m1, m2, 8);
				t1 = _mm256_alignr_epi8(m2, m3, 8);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
			_ => {
				// round 10
				t0 = _mm256_unpacklo_epi64(m5, m4);
				t1 = _mm256_unpackhi_epi64(m3, m0);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_unpacklo_epi64(m1, m2);
				t1 = _mm256_blend_epi32(m2, m3, 0x33);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				diagonalize(&mut a, &mut b, &mut c, &mut d);
				t0 = _mm256_unpackhi_epi64(m6, m7);
				t1 = _mm256_unpackhi_epi64(m4, m1);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
				t0 = _mm256_blend_epi32(m5, m0, 0x33);
				t1 = _mm256_unpacklo_epi64(m7, m6);
				b0 = _mm256_blend_epi32(t0, t1, 0xF0);
				g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
				undiagonalize(&mut a, &mut b, &mut c, &mut d);
			}
		}
	}

	// feed-forward: state[i] = state[i] ^ v[i] ^ v[i + 8]
	a = xor(a, c);
	b = xor(b, d);
	a = xor(a, iv0);
	b = xor(b, iv1);
	storeu(a, state_low);
	storeu(b, state_high);
}
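
// Thin wrappers around the AVX2 intrinsics used above. They are all
// `#[inline(always)]`, so they are compiled inside `compress`'s
// `#[target_feature(enable = "avx2")]` context rather than as standalone calls.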

#[inline(always)]
unsafe fn loadu(src: *const [u64; 4]) -> __m256i {
	// This is an unaligned load, so the pointer cast is allowed.
	_mm256_loadu_si256(src as *const __m256i)
}

#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut [u64; 4]) {
	// This is an unaligned store, so the pointer cast is allowed.
	_mm256_storeu_si256(dest as *mut __m256i, src)
}

#[inline(always)]
unsafe fn loadu_128(mem_addr: &[u64; 2]) -> __m128i {
	_mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
}

#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
	_mm256_add_epi64(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
	_mm256_xor_si256(a, b)
}

#[inline(always)]
unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
	_mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
}
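
// The three rotation amounts Blake2b uses are each implemented with the cheapest
// AVX2 sequence available: a 32-bit lane shuffle for 32, a byte shuffle (vpshufb)
// for 24 and 16, and a shift-plus-add pair for 63 (a rotate right by 63 is a
// rotate left by 1, and `x + x` is the cheap form of `x << 1`).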

#[inline(always)]
unsafe fn rotate_right_32(x: __m256i) -> __m256i {
	// swap the two 32-bit halves of each 64-bit lane
	_mm256_shuffle_epi32(x, _MM_SHUFFLE!(2, 3, 0, 1))
}

#[inline(always)]
unsafe fn rotate_right_24(x: __m256i) -> __m256i {
	// rotate each 8-byte lane right by 3 bytes
	let rotate24 = _mm256_setr_epi8(
		3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
		3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
	);
	_mm256_shuffle_epi8(x, rotate24)
}

#[inline(always)]
unsafe fn rotate_right_16(x: __m256i) -> __m256i {
	// rotate each 8-byte lane right by 2 bytes
	let rotate16 = _mm256_setr_epi8(
		2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
		2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
	);
	_mm256_shuffle_epi8(x, rotate16)
}

#[inline(always)]
unsafe fn rotate_right_63(x: __m256i) -> __m256i {
	_mm256_or_si256(_mm256_srli_epi64(x, 63), add(x, x))
}

#[inline(always)]
unsafe fn g1(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
	*a = add(*a, *m);
	*a = add(*a, *b);
	*d = xor(*d, *a);
	*d = rotate_right_32(*d);
	*c = add(*c, *d);
	*b = xor(*b, *c);
	*b = rotate_right_24(*b);
}

#[inline(always)]
unsafe fn g2(a: &mut __m256i, b: &mut __m256i, c: &mut __m256i, d: &mut __m256i, m: &mut __m256i) {
	*a = add(*a, *m);
	*a = add(*a, *b);
	*d = xor(*d, *a);
	*d = rotate_right_16(*d);
	*c = add(*c, *d);
	*b = xor(*b, *c);
	*b = rotate_right_63(*b);
}

// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads in `compress` are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
	*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
	*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
	*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
}

// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads in `compress` are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
	*a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
	*d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
	*c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
}

#[cfg(test)]
mod tests {
	#[test]
	fn test_mm_shuffle() {
		assert_eq!(_MM_SHUFFLE!(0, 1, 1, 3), 0b00_01_01_11);
		assert_eq!(_MM_SHUFFLE!(3, 1, 1, 0), 0b11_01_01_00);
		assert_eq!(_MM_SHUFFLE!(1, 2, 2, 1), 0b01_10_10_01);
	}
}
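
#[cfg(test)]
mod avx2_tests {
	// A minimal sanity check of the vectorized rotations against the scalar
	// `u64::rotate_right`. This sketch assumes the test harness links `std`
	// (for `is_x86_feature_detected!`) and simply skips itself on CPUs
	// without AVX2 support.
	#[test]
	fn rotations_match_scalar() {
		if !is_x86_feature_detected!("avx2") {
			return;
		}
		let input: [u64; 4] = [0x0123_4567_89ab_cdef, 1, !0, 0x8000_0000_0000_0000];
		let mut out = [0u64; 4];
		unsafe {
			let v = super::loadu(&input);
			super::storeu(super::rotate_right_32(v), &mut out);
			for i in 0..4 {
				assert_eq!(out[i], input[i].rotate_right(32));
			}
			super::storeu(super::rotate_right_24(v), &mut out);
			for i in 0..4 {
				assert_eq!(out[i], input[i].rotate_right(24));
			}
			super::storeu(super::rotate_right_16(v), &mut out);
			for i in 0..4 {
				assert_eq!(out[i], input[i].rotate_right(16));
			}
			super::storeu(super::rotate_right_63(v), &mut out);
			for i in 0..4 {
				assert_eq!(out[i], input[i].rotate_right(63));
			}
		}
	}
}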