openethereum/ethcore/src/snapshot/mod.rs

// Copyright 2015-2019 Parity Technologies (UK) Ltd.
// This file is part of Parity Ethereum.

// Parity Ethereum is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Parity Ethereum is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Parity Ethereum.  If not, see <http://www.gnu.org/licenses/>.

//! Snapshot creation, restoration, and network service.
//!
//! Documentation of the format can be found at
//! https://wiki.parity.io/Warp-Sync-Snapshot-Format

use std::collections::{HashMap, HashSet};
use std::cmp;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
use hash::{keccak, KECCAK_NULL_RLP, KECCAK_EMPTY};

use account_db::{AccountDB, AccountDBMut};
use blockchain::{BlockChain, BlockProvider};
use engines::EthEngine;
use types::header::Header;
use types::ids::BlockId;

use ethereum_types::{H256, U256};
use hash_db::HashDB;
use keccak_hasher::KeccakHasher;
use snappy;
use bytes::Bytes;
use parking_lot::Mutex;
use journaldb::{self, Algorithm, JournalDB};
use kvdb::{KeyValueDB, DBValue};
use trie::{Trie, TrieMut};
use ethtrie::{TrieDB, TrieDBMut};
use rlp::{RlpStream, Rlp};
use bloom_journal::Bloom;
use num_cpus;

use self::io::SnapshotWriter;

use super::state_db::StateDB;
use super::state::Account as StateAccount;

use crossbeam_utils::thread;
use rand::{Rng, OsRng};

pub use self::error::Error;

pub use self::consensus::*;
pub use self::service::{SnapshotClient, Service, DatabaseRestore};
pub use self::traits::SnapshotService;
pub use self::watcher::Watcher;
pub use types::snapshot_manifest::ManifestData;
pub use types::restoration_status::RestorationStatus;
pub use types::basic_account::BasicAccount;

pub mod io;
pub mod service;

mod account;
mod block;
mod consensus;
mod error;
mod watcher;

#[cfg(test)]
mod tests;

mod traits;

// Try to have chunks be around 4MB (before compression)
const PREFERRED_CHUNK_SIZE: usize = 4 * 1024 * 1024;

// Maximal chunk size (decompressed)
// Snappy::decompressed_len estimation may sometimes yield results greater
// than PREFERRED_CHUNK_SIZE so allow some threshold here.
const MAX_CHUNK_SIZE: usize = PREFERRED_CHUNK_SIZE / 4 * 5;

// Minimum supported state chunk version.
const MIN_SUPPORTED_STATE_CHUNK_VERSION: u64 = 1;
// current state chunk version.
const STATE_CHUNK_VERSION: u64 = 2;
/// number of snapshot subparts, must be a power of 2 in [1; 256]
const SNAPSHOT_SUBPARTS: usize = 16;
/// Maximum number of snapshot subparts (must be a multiple of `SNAPSHOT_SUBPARTS`)
const MAX_SNAPSHOT_SUBPARTS: usize = 256;

/// Configuration for the Snapshot service
#[derive(Debug, Clone, PartialEq)]
pub struct SnapshotConfiguration {
	/// If `true`, no periodic snapshots will be created
	pub no_periodic: bool,
	/// Number of threads for creating snapshots
	pub processing_threads: usize,
}

impl Default for SnapshotConfiguration {
	fn default() -> Self {
		SnapshotConfiguration {
			no_periodic: false,
			processing_threads: ::std::cmp::max(1, num_cpus::get_physical() / 2),
		}
	}
}

/// A progress indicator for snapshots.
#[derive(Debug, Default)]
pub struct Progress {
	accounts: AtomicUsize,
	blocks: AtomicUsize,
	size: AtomicU64,
	done: AtomicBool,
	abort: AtomicBool,
}

impl Progress {
	/// Reset the progress.
	pub fn reset(&self) {
		self.accounts.store(0, Ordering::Release);
		self.blocks.store(0, Ordering::Release);
		self.size.store(0, Ordering::Release);
		self.abort.store(false, Ordering::Release);

		// atomic fence here to ensure the others are written first?
		// logs might very rarely get polluted if not.
		self.done.store(false, Ordering::Release);
	}

	/// Get the number of accounts snapshotted thus far.
	pub fn accounts(&self) -> usize { self.accounts.load(Ordering::Acquire) }

	/// Get the number of blocks snapshotted thus far.
	pub fn blocks(&self) -> usize { self.blocks.load(Ordering::Acquire) }

	/// Get the written size of the snapshot in bytes.
	pub fn size(&self) -> u64 { self.size.load(Ordering::Acquire) }

	/// Whether the snapshot is complete.
	pub fn done(&self) -> bool  { self.done.load(Ordering::Acquire) }

}
/// Take a snapshot using the given blockchain, starting block hash, and database, writing into the given writer.
pub fn take_snapshot<W: SnapshotWriter + Send>(
	chunker: Box<dyn SnapshotComponents>,
	chain: &BlockChain,
	block_hash: H256,
	state_db: &dyn HashDB<KeccakHasher, DBValue>,
	writer: W,
	p: &Progress,
	processing_threads: usize,
) -> Result<(), Error> {
	let start_header = chain.block_header_data(&block_hash)
		.ok_or_else(|| Error::InvalidStartingBlock(BlockId::Hash(block_hash)))?;
	let state_root = start_header.state_root();
	let block_number = start_header.number();

	info!("Taking snapshot starting at block {}", block_number);

	let version = chunker.current_version();
	let writer = Mutex::new(writer);
	let (state_hashes, block_hashes) = thread::scope(|scope| -> Result<(Vec<H256>, Vec<H256>), Error> {
		let writer = &writer;
		let block_guard = scope.spawn(move |_| {
			chunk_secondary(chunker, chain, block_hash, writer, p)
		});

		// The number of threads must be between 1 and SNAPSHOT_SUBPARTS
		assert!(processing_threads >= 1, "Cannot use less than 1 threads for creating snapshots");
		let num_threads: usize = cmp::min(processing_threads, SNAPSHOT_SUBPARTS);
		info!(target: "snapshot", "Using {} threads for Snapshot creation.", num_threads);

		let mut state_guards = Vec::with_capacity(num_threads as usize);

		for thread_idx in 0..num_threads {
			let state_guard = scope.spawn(move |_| -> Result<Vec<H256>, Error> {
				let mut chunk_hashes = Vec::new();

				for part in (thread_idx..SNAPSHOT_SUBPARTS).step_by(num_threads) {
					debug!(target: "snapshot", "Chunking part {} in thread {}", part, thread_idx);
					let mut hashes = chunk_state(state_db, &state_root, writer, p, Some(part), thread_idx)?;
					chunk_hashes.append(&mut hashes);
				}

				Ok(chunk_hashes)
			});
			state_guards.push(state_guard);
		}

		let block_hashes = block_guard.join().expect("Sub-thread never panics; qed")?;
		let mut state_hashes = Vec::new();

		for guard in state_guards {
			let part_state_hashes = guard.join().expect("Sub-thread never panics; qed")?;
			state_hashes.extend(part_state_hashes);
		}

		debug!(target: "snapshot", "Took a snapshot of {} accounts", p.accounts.load(Ordering::SeqCst));
		Ok((state_hashes, block_hashes))
	}).expect("Sub-thread never panics; qed")?;

	info!(target: "snapshot", "produced {} state chunks and {} block chunks.", state_hashes.len(), block_hashes.len());

	let manifest_data = ManifestData {
		version,
		state_hashes,
		block_hashes,
		state_root,
		block_number,
		block_hash,
	};

	writer.into_inner().finish(manifest_data)?;

	p.done.store(true, Ordering::SeqCst);

	Ok(())
}

/// Create and write out all secondary chunks to disk, returning a vector of all
/// the hashes of secondary chunks created.
///
/// Secondary chunks are engine-specific, but they intend to corroborate the state data
/// in the state chunks.
/// Returns a list of chunk hashes, with the first having the blocks furthest from the genesis.
pub fn chunk_secondary<'a>(
	mut chunker: Box<dyn SnapshotComponents>,
	chain: &'a BlockChain,
	start_hash: H256,
	writer: &Mutex<dyn SnapshotWriter + 'a>,
	progress: &'a Progress
) -> Result<Vec<H256>, Error> {
	let mut chunk_hashes = Vec::new();
	let mut snappy_buffer = vec![0; snappy::max_compressed_len(PREFERRED_CHUNK_SIZE)];

	{
		let mut chunk_sink = |raw_data: &[u8]| {
			let compressed_size = snappy::compress_into(raw_data, &mut snappy_buffer);
			let compressed = &snappy_buffer[..compressed_size];
			let hash = keccak(&compressed);
			let size = compressed.len();

			writer.lock().write_block_chunk(hash, compressed)?;
			trace!(target: "snapshot", "wrote secondary chunk. hash: {:x}, size: {}, uncompressed size: {}",
				hash, size, raw_data.len());

			progress.size.fetch_add(size as u64, Ordering::SeqCst);
			chunk_hashes.push(hash);
			Ok(())
		};

		chunker.chunk_all(
			chain,
			start_hash,
			&mut chunk_sink,
			progress,
			PREFERRED_CHUNK_SIZE,
		)?;
	}

	Ok(chunk_hashes)
}

/// State trie chunker.
struct StateChunker<'a> {
	hashes: Vec<H256>,
	rlps: Vec<Bytes>,
	cur_size: usize,
	snappy_buffer: Vec<u8>,
	writer: &'a Mutex<dyn SnapshotWriter + 'a>,
	progress: &'a Progress,
	thread_idx: usize,
}

impl<'a> StateChunker<'a> {
	// Push a key, value pair to be encoded.
	//
	// If the buffer is greater than the desired chunk size,
	// this will write out the data to disk.
	fn push(&mut self, data: Bytes) -> Result<(), Error> {
		self.cur_size += data.len();
		self.rlps.push(data);
		Ok(())
	}

	// Write out the buffer to disk, pushing the created chunk's hash to
	// the list.
	fn write_chunk(&mut self) -> Result<(), Error> {
		let num_entries = self.rlps.len();
		let mut stream = RlpStream::new_list(num_entries);
		for rlp in self.rlps.drain(..) {
			stream.append_raw(&rlp, 1);
		}

		let raw_data = stream.out();

		let compressed_size = snappy::compress_into(&raw_data, &mut self.snappy_buffer);
		let compressed = &self.snappy_buffer[..compressed_size];
		let hash = keccak(&compressed);

		self.writer.lock().write_state_chunk(hash, compressed)?;
		trace!(target: "snapshot", "Thread {} wrote state chunk. size: {}, uncompressed size: {}", self.thread_idx, compressed_size, raw_data.len());

		self.progress.accounts.fetch_add(num_entries, Ordering::SeqCst);
		self.progress.size.fetch_add(compressed_size as u64, Ordering::SeqCst);

		self.hashes.push(hash);
		self.cur_size = 0;

		Ok(())
	}

	// Get current chunk size.
	fn chunk_size(&self) -> usize {
		self.cur_size
	}
}

/// Walk the given state database starting from the given root,
/// creating chunks and writing them out.
/// `part` is a number between 0 and 15, which describe which part of
/// the tree should be chunked.
///
/// Returns a list of hashes of chunks created, or any error it may
/// have encountered.
pub fn chunk_state<'a>(
	db: &dyn HashDB<KeccakHasher, DBValue>,
	root: &H256,
	writer: &Mutex<dyn SnapshotWriter + 'a>,
	progress: &'a Progress,
	part: Option<usize>,
	thread_idx: usize,
) -> Result<Vec<H256>, Error> {
	let account_trie = TrieDB::new(&db, &root)?;

	let mut chunker = StateChunker {
		hashes: Vec::new(),
		rlps: Vec::new(),
		cur_size: 0,
		snappy_buffer: vec![0; snappy::max_compressed_len(PREFERRED_CHUNK_SIZE)],
		writer,
		progress,
		thread_idx,
	};

	let mut used_code = HashSet::new();

	// account_key here is the address' hash.
	let mut account_iter = account_trie.iter()?;

	let mut seek_to = None;

	if let Some(part) = part {
		assert!(part < 16, "Wrong chunk state part number (must be <16) in snapshot creation.");

		let part_offset = MAX_SNAPSHOT_SUBPARTS / SNAPSHOT_SUBPARTS;
		let mut seek_from = vec![0; 32];
		seek_from[0] = (part * part_offset) as u8;
		account_iter.seek(&seek_from)?;

		// Set the upper-bound, except for the last part
		if part < SNAPSHOT_SUBPARTS - 1 {
			seek_to = Some(((part + 1) * part_offset) as u8)
		}
	}

	for item in account_iter {
		let (account_key, account_data) = item?;
		let account_key_hash = H256::from_slice(&account_key);

		if seek_to.map_or(false, |seek_to| account_key[0] >= seek_to) {
			break;
		}

		let account = ::rlp::decode(&*account_data)?;
		let account_db = AccountDB::from_hash(db, account_key_hash);

		let fat_rlps = account::to_fat_rlps(&account_key_hash, &account, &account_db, &mut used_code, PREFERRED_CHUNK_SIZE - chunker.chunk_size(), PREFERRED_CHUNK_SIZE, progress)?;
		for (i, fat_rlp) in fat_rlps.into_iter().enumerate() {
			if i > 0 {
				chunker.write_chunk()?;
			}
			chunker.push(fat_rlp)?;
		}
	}

	if chunker.cur_size != 0 {
		chunker.write_chunk()?;
	}

	Ok(chunker.hashes)
}

/// Used to rebuild the state trie piece by piece.
pub struct StateRebuilder {
	db: Box<dyn JournalDB>,
	state_root: H256,
	known_code: HashMap<H256, H256>, // code hashes mapped to first account with this code.
	missing_code: HashMap<H256, Vec<H256>>, // maps code hashes to lists of accounts missing that code.
	bloom: Bloom,
	known_storage_roots: HashMap<H256, H256>, // maps account hashes to last known storage root. Only filled for last account per chunk.
}

impl StateRebuilder {
	/// Create a new state rebuilder to write into the given backing DB.
	pub fn new(db: Arc<dyn KeyValueDB>, pruning: Algorithm) -> Self {
		StateRebuilder {
			db: journaldb::new(db.clone(), pruning, ::db::COL_STATE),
			state_root: KECCAK_NULL_RLP,
			known_code: HashMap::new(),
			missing_code: HashMap::new(),
			bloom: StateDB::load_bloom(&*db),
			known_storage_roots: HashMap::new(),
		}
	}

	/// Feed an uncompressed state chunk into the rebuilder.
	pub fn feed(&mut self, chunk: &[u8], flag: &AtomicBool) -> Result<(), ::error::Error> {
		let rlp = Rlp::new(chunk);
		let empty_rlp = StateAccount::new_basic(U256::zero(), U256::zero()).rlp();
		let mut pairs = Vec::with_capacity(rlp.item_count()?);

		// initialize the pairs vector with empty values so we have slots to write into.
		pairs.resize(rlp.item_count()?, (H256::zero(), Vec::new()));

		let status = rebuild_accounts(
			self.db.as_hash_db_mut(),
			rlp,
			&mut pairs,
			&self.known_code,
			&mut self.known_storage_roots,
			flag
		)?;

		for (addr_hash, code_hash) in status.missing_code {
			self.missing_code.entry(code_hash).or_insert_with(Vec::new).push(addr_hash);
		}

		// patch up all missing code. must be done after collecting all new missing code entries.
		for (code_hash, code, first_with) in status.new_code {
			for addr_hash in self.missing_code.remove(&code_hash).unwrap_or_else(Vec::new) {
				let mut db = AccountDBMut::from_hash(self.db.as_hash_db_mut(), addr_hash);
				db.emplace(code_hash, DBValue::from_slice(&code));
			}

			self.known_code.insert(code_hash, first_with);
		}

		let backing = self.db.backing().clone();

		// batch trie writes
		{
			let mut account_trie = if self.state_root != KECCAK_NULL_RLP {
				TrieDBMut::from_existing(self.db.as_hash_db_mut(), &mut self.state_root)?
			} else {
				TrieDBMut::new(self.db.as_hash_db_mut(), &mut self.state_root)
			};

			for (hash, thin_rlp) in pairs {
				if !flag.load(Ordering::SeqCst) { return Err(Error::RestorationAborted.into()) }

				if &thin_rlp[..] != &empty_rlp[..] {
					self.bloom.set(&*hash);
				}
				account_trie.insert(&hash, &thin_rlp)?;
			}
		}

		let bloom_journal = self.bloom.drain_journal();
		let mut batch = backing.transaction();
		StateDB::commit_bloom(&mut batch, bloom_journal)?;
		self.db.inject(&mut batch)?;
		backing.write_buffered(batch);
		trace!(target: "snapshot", "current state root: {:?}", self.state_root);
		Ok(())
	}

	/// Finalize the restoration. Check for accounts missing code and make a dummy
	/// journal entry.
	/// Once all chunks have been fed, there should be nothing missing.
	pub fn finalize(mut self, era: u64, id: H256) -> Result<Box<dyn JournalDB>, ::error::Error> {
		let missing = self.missing_code.keys().cloned().collect::<Vec<_>>();
		if !missing.is_empty() { return Err(Error::MissingCode(missing).into()) }

		let mut batch = self.db.backing().transaction();
		self.db.journal_under(&mut batch, era, &id)?;
		self.db.backing().write_buffered(batch);

		Ok(self.db)
	}

	/// Get the state root of the rebuilder.
	pub fn state_root(&self) -> H256 { self.state_root }
}

#[derive(Default)]
struct RebuiltStatus {
	// new code that's become available. (code_hash, code, addr_hash)
	new_code: Vec<(H256, Bytes, H256)>,
	missing_code: Vec<(H256, H256)>, // accounts that are missing code.
}

// rebuild a set of accounts and their storage.
// returns a status detailing newly-loaded code and accounts missing code.
fn rebuild_accounts(
	db: &mut dyn HashDB<KeccakHasher, DBValue>,
	account_fat_rlps: Rlp,
	out_chunk: &mut [(H256, Bytes)],
	known_code: &HashMap<H256, H256>,
	known_storage_roots: &mut HashMap<H256, H256>,
	abort_flag: &AtomicBool,
) -> Result<RebuiltStatus, ::error::Error> {
	let mut status = RebuiltStatus::default();
	for (account_rlp, out) in account_fat_rlps.into_iter().zip(out_chunk.iter_mut()) {
		if !abort_flag.load(Ordering::SeqCst) { return Err(Error::RestorationAborted.into()) }

		let hash: H256 = account_rlp.val_at(0)?;
		let fat_rlp = account_rlp.at(1)?;

		let thin_rlp = {

			// fill out the storage trie and code while decoding.
			let (acc, maybe_code) = {
				let mut acct_db = AccountDBMut::from_hash(db, hash);
				let storage_root = known_storage_roots.get(&hash).cloned().unwrap_or_default();
				account::from_fat_rlp(&mut acct_db, fat_rlp, storage_root)?
			};

			let code_hash = acc.code_hash.clone();
			match maybe_code {
				// new inline code
				Some(code) => status.new_code.push((code_hash, code, hash)),
				None => {
					if code_hash != KECCAK_EMPTY {
						// see if this code has already been included inline
						match known_code.get(&code_hash) {
							Some(&first_with) => {
								// if so, load it from the database.
								let code = AccountDB::from_hash(db, first_with)
									.get(&code_hash)
									.ok_or_else(|| Error::MissingCode(vec![first_with]))?;

								// and write it again under a different mangled key
								AccountDBMut::from_hash(db, hash).emplace(code_hash, code);
							}
							// if not, queue it up to be filled later
							None => status.missing_code.push((hash, code_hash)),
						}
					}
				}
			}

			::rlp::encode(&acc)
		};

		*out = (hash, thin_rlp);
	}
	if let Some(&(ref hash, ref rlp)) = out_chunk.iter().last() {
		known_storage_roots.insert(*hash, ::rlp::decode::<BasicAccount>(rlp)?.storage_root);
	}
	if let Some(&(ref hash, ref rlp)) = out_chunk.iter().next() {
		known_storage_roots.insert(*hash, ::rlp::decode::<BasicAccount>(rlp)?.storage_root);
	}
	Ok(status)
}

/// Proportion of blocks which we will verify `PoW` for.
const POW_VERIFY_RATE: f32 = 0.02;

/// Verify an old block with the given header, engine, blockchain, body. If `always` is set, it will perform
/// the fullest verification possible. If not, it will take a random sample to determine whether it will
/// do heavy or light verification.
pub fn verify_old_block(rng: &mut OsRng, header: &Header, engine: &dyn EthEngine, chain: &BlockChain, always: bool) -> Result<(), ::error::Error> {
	engine.verify_block_basic(header)?;

	if always || rng.gen::<f32>() <= POW_VERIFY_RATE {
		engine.verify_block_unordered(header)?;
		match chain.block_header_data(header.parent_hash()) {
			Some(parent) => engine.verify_block_family(header, &parent.decode()?),
			None => Ok(()),
		}
	} else {
		Ok(())
	}
}