// Copyright 2015-2020 Parity Technologies (UK) Ltd.
// This file is part of Parity.

// Parity is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Parity is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Parity.  If not, see <http://www.gnu.org/licenses/>.

//! ProgPoW (Programmatic Proof-of-Work) is the Ethereum network's proposed new Application-Specific Integrated
//! Circuit (ASIC) resistant Proof-of-Work mining algorithm.
//!
//! ProgPoW's aim is to reduce the efficiencies of specialized mining devices known as ASIC chips
//! (and accelerated GPU-based setups), and to maximize the performance of General Purpose Hardware (GPUs) to enable
//! more users to compete for new cryptocurrency awarded by the protocol.
//!
//! ASIC chips are those that are tailored to efficiently mining cryptocurrency based on a specific hashing algorithm.
//!
//! GPU mining setups are less specialised are struggle to compete for mining rewards.
//!
//! It would be a change from Ethereum's current PoW mining algorithm known as Ethash.
//!
//! ProgPoW audits have been proposed to analyse the efficiency of a ProgPoW ASICs over
//! GPUs and analysis of the economic impact on the Ethereum protocol.
//!
//! We use ProgPoW 0.9.3 version as suggested on Specification
//! https://github.com/ethereum/EIPs/blob/master/EIPS/eip-1057.md#specification

use compute::{FNV_PRIME, calculate_dag_item};
use keccak::H256;
use shared::{ETHASH_ACCESSES, ETHASH_MIX_BYTES, Node, get_data_size};

const PROGPOW_CACHE_BYTES: usize = 16 * 1024;
const PROGPOW_CACHE_WORDS: usize = PROGPOW_CACHE_BYTES / 4;
const PROGPOW_CNT_CACHE: usize = 11;
const PROGPOW_CNT_MATH: usize = 18;
const PROGPOW_CNT_DAG: usize = ETHASH_ACCESSES;
const PROGPOW_DAG_LOADS: usize = 4;
const PROGPOW_MIX_BYTES: usize = 2 * ETHASH_MIX_BYTES;
const PROGPOW_PERIOD_LENGTH: usize = 10; // blocks per progpow epoch (N)
const PROGPOW_LANES: usize = 16;
const PROGPOW_REGS: usize = 32;

const FNV_HASH: u32 = 0x811c9dc5;

const KECCAKF_RNDC: [u32; 24] = [
	0x00000001, 0x00008082, 0x0000808a, 0x80008000, 0x0000808b, 0x80000001,
	0x80008081, 0x00008009, 0x0000008a, 0x00000088, 0x80008009, 0x8000000a,
	0x8000808b, 0x0000008b, 0x00008089, 0x00008003, 0x00008002, 0x00000080,
	0x0000800a, 0x8000000a, 0x80008081, 0x00008080, 0x80000001, 0x80008008
];

const KECCAKF_ROTC: [u32; 24] = [
	1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
	27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
];

const KECCAKF_PILN: [usize; 24] = [
	10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
	15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
];

fn keccak_f800_round(st: &mut [u32; 25], r: usize) {
	// Theta
	let mut bc = [0u32; 5];
	for i in 0..bc.len() {
		bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
	}

	for i in 0..bc.len() {
		let t = bc[(i + 4) % 5] ^ bc[(i + 1) % 5].rotate_left(1);
		for j in (0..st.len()).step_by(5) {
			st[j + i] ^= t;
		}
	}

	// Rho Pi
	let mut t = st[1];

	debug_assert_eq!(KECCAKF_ROTC.len(), 24);
	for i in 0..24 {
		let j = KECCAKF_PILN[i];
		bc[0] = st[j];
		st[j] = t.rotate_left(KECCAKF_ROTC[i]);
		t = bc[0];
	}

	// Chi
	for j in (0..st.len()).step_by(5) {
		for i in 0..bc.len() {
			bc[i] = st[j + i];
		}
		for i in 0..bc.len() {
			st[j + i] ^= (!bc[(i + 1) % 5]) & bc[(i + 2) % 5];
		}
	}

	// Iota
	debug_assert!(r < KECCAKF_RNDC.len());
	st[0] ^= KECCAKF_RNDC[r];
}

fn keccak_f800(header_hash: H256, nonce: u64, result: [u32; 8], st: &mut [u32; 25]) {
	for i in 0..8 {
		st[i] = (header_hash[4 * i] as u32) +
			((header_hash[4 * i + 1] as u32) << 8) +
			((header_hash[4 * i + 2] as u32) << 16) +
			((header_hash[4 * i + 3] as u32) << 24);
	}

	st[8] = nonce as u32;
	st[9] = (nonce >> 32) as u32;

	for i in 0..8 {
		st[10 + i] = result[i];
	}

	for r in 0..22 {
		keccak_f800_round(st, r);
	}
}

pub fn keccak_f800_short(header_hash: H256, nonce: u64, result: [u32; 8]) -> u64 {
	let mut st = [0u32; 25];
	keccak_f800(header_hash, nonce, result, &mut st);
	(st[0].swap_bytes() as u64) << 32 | st[1].swap_bytes() as u64
}

pub fn keccak_f800_long(header_hash: H256, nonce: u64, result: [u32; 8]) -> H256 {
	let mut st = [0u32; 25];
	keccak_f800(header_hash, nonce, result, &mut st);

	// NOTE: transmute from `[u32; 8]` to `[u8; 32]`
	unsafe {
		std::mem::transmute(
			[st[0], st[1], st[2], st[3], st[4], st[5], st[6], st[7]]
		)
	}
}

#[inline]
fn fnv1a_hash(h: u32, d: u32) -> u32 {
	(h ^ d).wrapping_mul(FNV_PRIME)
}

#[derive(Clone)]
struct Kiss99 {
	z: u32,
	w: u32,
	jsr: u32,
	jcong: u32,
}

impl Kiss99 {
	fn new(z: u32, w: u32, jsr: u32, jcong: u32) -> Kiss99 {
		Kiss99 { z, w, jsr, jcong }
	}

	#[inline]
	fn next_u32(&mut self) -> u32 {
		self.z = 36969u32.wrapping_mul(self.z & 65535).wrapping_add(self.z >> 16);
		self.w = 18000u32.wrapping_mul(self.w & 65535).wrapping_add(self.w >> 16);
		let mwc = (self.z << 16).wrapping_add(self.w);
		self.jsr ^= self.jsr << 17;
		self.jsr ^= self.jsr >> 13;
		self.jsr ^= self.jsr << 5;
		self.jcong = 69069u32.wrapping_mul(self.jcong).wrapping_add(1234567);

		(mwc ^ self.jcong).wrapping_add(self.jsr)
	}
}

fn fill_mix(seed: u64, lane_id: u32) -> [u32; PROGPOW_REGS] {
	// Use FNV to expand the per-warp seed to per-lane
	// Use KISS to expand the per-lane seed to fill mix
	let z = fnv1a_hash(FNV_HASH, seed as u32);
	let w = fnv1a_hash(z, (seed >> 32) as u32);
	let jsr = fnv1a_hash(w, lane_id);
	let jcong = fnv1a_hash(jsr, lane_id);

	let mut rnd = Kiss99::new(z, w, jsr, jcong);

	let mut mix = [0; PROGPOW_REGS];

	debug_assert_eq!(PROGPOW_REGS, 32);
	for i in 0..32 {
		mix[i] = rnd.next_u32();
	}

	mix
}

// Merge new data from b into the value in a. Assuming A has high entropy only
// do ops that retain entropy even if B is low entropy (IE don't do A&B)
fn merge(a: u32, b: u32, r: u32) -> u32 {
	match r % 4 {
		0 => a.wrapping_mul(33).wrapping_add(b),
		1 => (a ^ b).wrapping_mul(33),
		2 => a.rotate_left(((r >> 16) % 31) + 1) ^ b,
		_ => a.rotate_right(((r >> 16) % 31) + 1) ^ b,
	}
}

fn math(a: u32, b: u32, r: u32) -> u32 {
	match r % 11 {
		0 => a.wrapping_add(b),
		1 => a.wrapping_mul(b),
		2 => ((a as u64).wrapping_mul(b as u64) >> 32) as u32,
		3 => a.min(b),
		4 => a.rotate_left(b),
		5 => a.rotate_right(b),
		6 => a & b,
		7 => a | b,
		8 => a ^ b,
		9 => a.leading_zeros() + b.leading_zeros(),
		_ => a.count_ones() + b.count_ones(),
	}
}

fn progpow_init(seed: u64) -> (Kiss99, [u32; PROGPOW_REGS], [u32; PROGPOW_REGS]) {
	let z = fnv1a_hash(FNV_HASH, seed as u32);
	let w = fnv1a_hash(z, (seed >> 32) as u32);
	let jsr = fnv1a_hash(w, seed as u32);
	let jcong = fnv1a_hash(jsr, (seed >> 32) as u32);

	let mut rnd = Kiss99::new(z, w, jsr, jcong);

	// Create a random sequence of mix destinations for merge() and mix sources
	// for cache reads guarantees every destination merged once and guarantees
	// no duplicate cache reads, which could be optimized away. Uses
	// Fisher-Yates shuffle.
	let mut mix_seq_dst = [0u32; PROGPOW_REGS];
	let mut mix_seq_cache = [0u32; PROGPOW_REGS];
	for i in 0..mix_seq_dst.len() {
		mix_seq_dst[i] = i as u32;
		mix_seq_cache[i] = i as u32;
	}

	for i in (1..mix_seq_dst.len()).rev() {
		let j = rnd.next_u32() as usize % (i + 1);
		mix_seq_dst.swap(i, j);

		let j = rnd.next_u32() as usize % (i + 1);
		mix_seq_cache.swap(i, j);
	}

	(rnd, mix_seq_dst, mix_seq_cache)
}

pub type CDag = [u32; PROGPOW_CACHE_WORDS];

fn progpow_loop(
	seed: u64,
	loop_: usize,
	mix: &mut [[u32; PROGPOW_REGS]; PROGPOW_LANES],
	cache: &[Node],
	c_dag: &CDag,
	data_size: usize,
) {
	// All lanes share a base address for the global load. Global offset uses
	// mix[0] to guarantee it depends on the load result.
	let g_offset = mix[loop_ % PROGPOW_LANES][0] as usize %
		(64 * data_size / (PROGPOW_LANES * PROGPOW_DAG_LOADS));

	// 256 bytes of dag data
	let mut dag_item = [0u32; 64];

	// Fetch DAG nodes (64 bytes each)
	for l in 0..PROGPOW_DAG_LOADS {
		let index = g_offset * PROGPOW_LANES * PROGPOW_DAG_LOADS + l * 16;
		let node = calculate_dag_item(index as u32 / 16, cache);
		dag_item[l * 16..(l + 1) * 16].clone_from_slice(node.as_words());
	}

	let (rnd, mix_seq_dst, mix_seq_cache) = progpow_init(seed);

	// Lanes can execute in parallel and will be convergent
	for l in 0..mix.len() {
		let mut rnd = rnd.clone();

		// Initialize the seed and mix destination sequence
		let mut mix_seq_dst_cnt = 0;
		let mut mix_seq_cache_cnt = 0;

		let mut mix_dst = || {
			let res = mix_seq_dst[mix_seq_dst_cnt % PROGPOW_REGS] as usize;
			mix_seq_dst_cnt += 1;
			res
		};
		let mut mix_cache = || {
			let res = mix_seq_cache[mix_seq_cache_cnt % PROGPOW_REGS] as usize;
			mix_seq_cache_cnt += 1;
			res
		};

		for i in 0..PROGPOW_CNT_CACHE.max(PROGPOW_CNT_MATH) {
			if i < PROGPOW_CNT_CACHE {
				// Cached memory access, lanes access random 32-bit locations
				// within the first portion of the DAG
				let offset = mix[l][mix_cache()] as usize % PROGPOW_CACHE_WORDS;
				let data = c_dag[offset];
				let dst = mix_dst();

				mix[l][dst] = merge(mix[l][dst], data, rnd.next_u32());
			}

			if i < PROGPOW_CNT_MATH {
				// Random math
				// Generate 2 unique sources
				let src_rnd = rnd.next_u32() % (PROGPOW_REGS * (PROGPOW_REGS - 1)) as u32;
				let src1 = src_rnd % PROGPOW_REGS as u32; // 0 <= src1 < PROGPOW_REGS
				let mut src2 = src_rnd / PROGPOW_REGS as u32; // 0 <= src2 < PROGPOW_REGS - 1
				if src2 >= src1 {
					src2 += 1; // src2 is now any reg other than src1
				}

				let data = math(mix[l][src1 as usize], mix[l][src2 as usize], rnd.next_u32());
				let dst = mix_dst();

				mix[l][dst] = merge(mix[l][dst], data, rnd.next_u32());
			}
		}

		// Global load to sequential locations
		let mut data_g = [0u32; PROGPOW_DAG_LOADS];
		let index = ((l ^ loop_) % PROGPOW_LANES) * PROGPOW_DAG_LOADS;
		for i in 0..PROGPOW_DAG_LOADS {
			data_g[i] = dag_item[index + i];
		}

		// Consume the global load data at the very end of the loop to allow
		// full latency hiding. Always merge into `mix[0]` to feed the offset
		// calculation.
		mix[l][0] = merge(mix[l][0], data_g[0], rnd.next_u32());
		for i in 1..PROGPOW_DAG_LOADS {
			let dst = mix_dst();
			mix[l][dst] = merge(mix[l][dst], data_g[i], rnd.next_u32());
		}
	}
}

pub fn progpow(
	header_hash: H256,
	nonce: u64,
	block_number: u64,
	cache: &[Node],
	c_dag: &CDag,
) -> (H256, H256) {
	let mut mix = [[0u32; PROGPOW_REGS]; PROGPOW_LANES];
	let mut lane_results = [0u32; PROGPOW_LANES];
	let mut result = [0u32; 8];

	let data_size = get_data_size(block_number) / PROGPOW_MIX_BYTES;

	// NOTE: This assert is required to aid the optimizer elide the non-zero
	// remainder check in `progpow_loop`.
	assert!(data_size > 0);

	// Initialize mix for all lanes
	let seed = keccak_f800_short(header_hash, nonce, result);

	for l in 0..mix.len() {
		mix[l] = fill_mix(seed, l as u32);
	}

	// Execute the randomly generated inner loop
	let period = block_number / PROGPOW_PERIOD_LENGTH as u64;
	for i in 0..PROGPOW_CNT_DAG {
		progpow_loop(
			period,
			i,
			&mut mix,
			cache,
			c_dag,
			data_size,
		);
	}

	// Reduce mix data to a single per-lane result
	for l in 0..lane_results.len() {
		lane_results[l] = FNV_HASH;
		for i in 0..PROGPOW_REGS {
			lane_results[l] = fnv1a_hash(lane_results[l], mix[l][i]);
		}
	}

	// Reduce all lanes to a single 128-bit result
	result = [FNV_HASH; 8];
	for l in 0..PROGPOW_LANES {
		result[l % 8] = fnv1a_hash(result[l % 8], lane_results[l]);
	}

	let digest = keccak_f800_long(header_hash, seed, result);

	// NOTE: transmute from `[u32; 8]` to `[u8; 32]`
	let result = unsafe { ::std::mem::transmute(result) };

	(digest, result)
}

pub fn generate_cdag(cache: &[Node]) -> CDag {
	let mut c_dag = [0u32; PROGPOW_CACHE_WORDS];

	for i in 0..PROGPOW_CACHE_WORDS / 16 {
		let node = calculate_dag_item(i as u32, cache);
		for j in 0..16 {
			c_dag[i * 16 + j] = node.as_words()[j];
		}
	}

	c_dag
}

#[cfg(test)]
mod test {
	use tempdir::TempDir;

	use common_types::engines::OptimizeFor;
	use cache::NodeCacheBuilder;
	use keccak::H256;
	use rustc_hex::FromHex;
	use serde_json::{self, Value};
	use std::collections::VecDeque;
	use super::*;

	fn h256(hex: &str) -> H256 {
		let bytes = FromHex::from_hex(hex).unwrap();
		let mut res = [0; 32];
		res.copy_from_slice(&bytes);
		res
	}

	#[test]
	fn test_cdag() {
		let builder = NodeCacheBuilder::new(OptimizeFor::Memory, u64::max_value());
		let tempdir = TempDir::new("").unwrap();
		let cache = builder.new_cache(tempdir.into_path(), 0);

		let c_dag = generate_cdag(cache.as_ref());

		let expected = vec![
			690150178u32, 1181503948, 2248155602, 2118233073, 2193871115,
			1791778428, 1067701239, 724807309, 530799275, 3480325829, 3899029234,
			1998124059, 2541974622, 1100859971, 1297211151, 3268320000, 2217813733,
			2690422980, 3172863319, 2651064309
		];

		assert_eq!(
			c_dag.iter().take(20).cloned().collect::<Vec<_>>(),
			expected,
		);
	}

	#[test]
	fn test_random_merge() {
		let tests = [
			(1000000u32, 101u32, 33000101u32),
			(2000000, 102, 66003366),
			(3000000, 103, 6000103),
			(4000000, 104, 2000104),
			(1000000, 0, 33000000),
			(2000000, 0, 66000000),
			(3000000, 0, 6000000),
			(4000000, 0, 2000000),
		];

		for (i, &(a, b, expected)) in tests.iter().enumerate() {
			assert_eq!(
				merge(a, b, i as u32),
				expected,
			);
		}
	}

	#[test]
	fn test_random_math() {
		let tests = [
			(20u32, 22u32, 42u32),
			(70000, 80000, 1305032704),
			(70000, 80000, 1),
			(1, 2, 1),
			(3, 10000, 196608),
			(3, 0, 3),
			(3, 6, 2),
			(3, 6, 7),
			(3, 6, 5),
			(0, 0xffffffff, 32),
			(3 << 13, 1 << 5, 3),
			(22, 20, 42),
			(80000, 70000, 1305032704),
			(80000, 70000, 1),
			(2, 1, 1),
			(10000, 3, 80000),
			(0, 3, 0),
			(6, 3, 2),
			(6, 3, 7),
			(6, 3, 5),
			(0, 0xffffffff, 32),
			(3 << 13, 1 << 5, 3),
		];

		for (i, &(a, b, expected)) in tests.iter().enumerate() {
			assert_eq!(
				math(a, b, i as u32),
				expected,
			);
		}
	}

	#[test]
	fn test_keccak_256() {
		let expected = "5dd431e5fbc604f499bfa0232f45f8f142d0ff5178f539e5a7800bf0643697af";
		assert_eq!(
			keccak_f800_long([0; 32], 0, [0; 8]),
			h256(expected),
		);
	}

	#[test]
	fn test_keccak_64() {
		let expected: u64 = 0x5dd431e5fbc604f4;
		assert_eq!(
			keccak_f800_short([0; 32], 0, [0; 8]),
			expected,
		);
	}

	#[test]
	fn test_progpow_hash() {
		let builder = NodeCacheBuilder::new(OptimizeFor::Memory, u64::max_value());
		let tempdir = TempDir::new("").unwrap();
		let cache = builder.new_cache(tempdir.into_path(), 0);
		let c_dag = generate_cdag(cache.as_ref());

		let header_hash = [0; 32];

		let (digest, result) = progpow(
			header_hash,
			0,
			0,
			cache.as_ref(),
			&c_dag,
		);

		let expected_digest = FromHex::from_hex("b3bad9ca6f7c566cf0377d1f8cce29d6516a96562c122d924626281ec948ef02").unwrap();
		let expected_result = FromHex::from_hex("f4ac202715ded4136e72887c39e63a4738331c57fd9eb79f6ec421c281aa8743").unwrap();

		assert_eq!(
			digest.to_vec(),
			expected_digest,
		);

		assert_eq!(
			result.to_vec(),
			expected_result,
		);
	}

	#[test]
	fn test_progpow_testvectors() {
		struct ProgpowTest {
			block_number: u64,
			header_hash: H256,
			nonce: u64,
			mix_hash: H256,
			final_hash: H256,
		}

		let tests: Vec<VecDeque<Value>> =
			serde_json::from_slice(include_bytes!("../res/progpow_testvectors.json")).unwrap();

		let tests: Vec<ProgpowTest> = tests.into_iter().map(|mut test: VecDeque<Value>| {
			assert!(test.len() == 5);

			let block_number: u64 = serde_json::from_value(test.pop_front().unwrap()).unwrap();
			let header_hash: String = serde_json::from_value(test.pop_front().unwrap()).unwrap();
			let nonce: String = serde_json::from_value(test.pop_front().unwrap()).unwrap();
			let mix_hash: String = serde_json::from_value(test.pop_front().unwrap()).unwrap();
			let final_hash: String = serde_json::from_value(test.pop_front().unwrap()).unwrap();

			ProgpowTest {
				block_number,
				header_hash: h256(&header_hash),
				nonce: u64::from_str_radix(&nonce, 16).unwrap(),
				mix_hash: h256(&mix_hash),
				final_hash: h256(&final_hash),
			}
		}).collect();

		for test in tests {
			let builder = NodeCacheBuilder::new(OptimizeFor::Memory, u64::max_value());
			let tempdir = TempDir::new("").unwrap();
			let cache = builder.new_cache(tempdir.path().to_owned(), test.block_number);
			let c_dag = generate_cdag(cache.as_ref());

			let (digest, result) = progpow(
				test.header_hash,
				test.nonce,
				test.block_number,
				cache.as_ref(),
				&c_dag,
			);

			assert_eq!(digest, test.final_hash);
			assert_eq!(result, test.mix_hash);
		}
	}
}