Implement option 1.

This commit is contained in:
Gav Wood 2016-03-06 21:57:55 +01:00
parent b14b4cfef8
commit 51c95d4d67

View File

@ -20,15 +20,12 @@ use common::*;
use rlp::*; use rlp::*;
use hashdb::*; use hashdb::*;
use memorydb::*; use memorydb::*;
use kvdb::{Database, DBTransaction, DatabaseConfig}; use rocksdb::{DB, Writable, WriteBatch, IteratorMode};
#[cfg(test)] #[cfg(test)]
use std::env; use std::env;
/// Implementation of the HashDB trait for a disk-backed database with a memory overlay /// Implementation of the HashDB trait for a disk-backed database with a memory overlay
/// and, possibly, latent-removal semantics. /// and latent-removal semantics.
///
/// If `counters` is `None`, then it behaves exactly like OverlayDB. If not it behaves
/// differently:
/// ///
/// Like OverlayDB, there is a memory overlay; `commit()` must be called in order to /// Like OverlayDB, there is a memory overlay; `commit()` must be called in order to
/// write operations out to disk. Unlike OverlayDB, `remove()` operations do not take effect /// write operations out to disk. Unlike OverlayDB, `remove()` operations do not take effect
@ -36,8 +33,8 @@ use std::env;
/// the removals actually take effect. /// the removals actually take effect.
pub struct JournalDB { pub struct JournalDB {
overlay: MemoryDB, overlay: MemoryDB,
backing: Arc<Database>, backing: Arc<DB>,
counters: Option<Arc<RwLock<HashMap<H256, i32>>>>, counters: Arc<RwLock<HashMap<H256, i32>>>,
} }
impl Clone for JournalDB { impl Clone for JournalDB {
@ -50,51 +47,33 @@ impl Clone for JournalDB {
} }
} }
// all keys must be at least 12 bytes const LAST_ERA_KEY : [u8; 4] = [ b'l', b'a', b's', b't' ];
const LATEST_ERA_KEY : [u8; 12] = [ b'l', b'a', b's', b't', 0, 0, 0, 0, 0, 0, 0, 0 ]; const VERSION_KEY : [u8; 4] = [ b'j', b'v', b'e', b'r' ];
const VERSION_KEY : [u8; 12] = [ b'j', b'v', b'e', b'r', 0, 0, 0, 0, 0, 0, 0, 0 ];
const DB_VERSION : u32 = 3; const DB_VERSION: u32 = 1;
const DB_VERSION_NO_JOURNAL : u32 = 3 + 256;
const PADDING : [u8; 10] = [ 0u8; 10 ];
impl JournalDB { impl JournalDB {
/// Create a new instance given a `backing` database.
/// Create a new instance from file pub fn new(backing: DB) -> JournalDB {
pub fn new(path: &str) -> JournalDB { let db = Arc::new(backing);
Self::from_prefs(path, true) JournalDB::new_with_arc(db)
} }
/// Create a new instance from file /// Create a new instance given a shared `backing` database.
pub fn from_prefs(path: &str, prefer_journal: bool) -> JournalDB { pub fn new_with_arc(backing: Arc<DB>) -> JournalDB {
let opts = DatabaseConfig { if backing.iterator(IteratorMode::Start).next().is_some() {
prefix_size: Some(12) //use 12 bytes as prefix, this must match account_db prefix
};
let backing = Database::open(&opts, path).unwrap_or_else(|e| {
panic!("Error opening state db: {}", e);
});
let with_journal;
if !backing.is_empty() {
match backing.get(&VERSION_KEY).map(|d| d.map(|v| decode::<u32>(&v))) { match backing.get(&VERSION_KEY).map(|d| d.map(|v| decode::<u32>(&v))) {
Ok(Some(DB_VERSION)) => { with_journal = true; }, Ok(Some(DB_VERSION)) => {},
Ok(Some(DB_VERSION_NO_JOURNAL)) => { with_journal = false; },
v => panic!("Incompatible DB version, expected {}, got {:?}", DB_VERSION, v) v => panic!("Incompatible DB version, expected {}, got {:?}", DB_VERSION, v)
} }
} else { } else {
backing.put(&VERSION_KEY, &encode(&(if prefer_journal { DB_VERSION } else { DB_VERSION_NO_JOURNAL }))).expect("Error writing version to database"); backing.put(&VERSION_KEY, &encode(&DB_VERSION)).expect("Error writing version to database");
with_journal = prefer_journal;
} }
let counters = JournalDB::read_counters(&backing);
let counters = if with_journal {
Some(Arc::new(RwLock::new(JournalDB::read_counters(&backing))))
} else {
None
};
JournalDB { JournalDB {
overlay: MemoryDB::new(), overlay: MemoryDB::new(),
backing: Arc::new(backing), backing: backing,
counters: counters, counters: Arc::new(RwLock::new(counters)),
} }
} }
@ -103,55 +82,93 @@ impl JournalDB {
pub fn new_temp() -> JournalDB { pub fn new_temp() -> JournalDB {
let mut dir = env::temp_dir(); let mut dir = env::temp_dir();
dir.push(H32::random().hex()); dir.push(H32::random().hex());
Self::new(dir.to_str().unwrap()) Self::new(DB::open_default(dir.to_str().unwrap()).unwrap())
} }
/// Check if this database has any commits /// Check if this database has any commits
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.backing.get(&LATEST_ERA_KEY).expect("Low level database error").is_none() self.backing.get(&LAST_ERA_KEY).expect("Low level database error").is_none()
} }
/// Commit all recent insert operations. fn morph_key(key: &H256, index: u8) -> Bytes {
pub fn commit(&mut self, now: u64, id: &H256, end: Option<(u64, H256)>) -> Result<u32, UtilError> { let mut ret = key.bytes().to_owned();
let have_counters = self.counters.is_some(); ret.push(index);
if have_counters { ret
self.commit_with_counters(now, id, end) }
// The next three are valid only as long as there is an insert operation of `key` in the journal.
fn set_already_in(batch: &WriteBatch, key: &H256) { batch.put(&Self::morph_key(key, 0), &[1u8]); }
fn reset_already_in(batch: &WriteBatch, key: &H256) { batch.delete(&Self::morph_key(key, 0)); }
fn is_already_in(backing: &DB, key: &H256) -> bool {
backing.get(&Self::morph_key(key, 0)).expect("Low-level database error. Some issue with your hard disk?").is_some()
}
fn insert_keys(inserts: &Vec<(H256, Bytes)>, backing: &DB, counters: &mut HashMap<H256, i32>, batch: &WriteBatch) {
for &(ref h, ref d) in inserts {
if let Some(c) = counters.get_mut(h) {
// already counting. increment.
*c += 1;
continue;
}
// this is the first entry for this node in the journal.
if backing.get(&h.bytes()).expect("Low-level database error. Some issue with your hard disk?").is_some() {
// already in the backing DB. start counting, and remember it was already in.
Self::set_already_in(batch, &h);
counters.insert(h.clone(), 1);
continue;
}
// Gets removed when a key leaves the journal, so should never be set when we're placing a new key.
//Self::reset_already_in(&h);
assert!(!Self::is_already_in(backing, &h));
batch.put(&h.bytes(), d);
}
}
fn replay_keys(inserts: &Vec<H256>, backing: &DB, counters: &mut HashMap<H256, i32>) {
for h in inserts {
if let Some(c) = counters.get_mut(h) {
// already counting. increment.
*c += 1;
continue;
}
// this is the first entry for this node in the journal.
// it is initialised to 1 if it was already in.
counters.insert(h.clone(), if Self::is_already_in(backing, h) {1} else {0});
}
}
fn kill_keys(deletes: Vec<H256>, counters: &mut HashMap<H256, i32>, batch: &WriteBatch) {
for h in deletes.into_iter() {
let mut n: Option<i32> = None;
if let Some(c) = counters.get_mut(&h) {
if *c > 1 {
*c -= 1;
continue;
} else { } else {
self.commit_without_counters() n = Some(*c);
} }
} }
match &n {
/// Drain the overlay and place it into a batch for the DB. &Some(i) if i == 1 => {
fn batch_overlay_insertions(overlay: &mut MemoryDB, batch: &DBTransaction) -> usize { counters.remove(&h);
let mut inserts = 0usize; Self::reset_already_in(batch, &h);
let mut deletes = 0usize;
for i in overlay.drain().into_iter() {
let (key, (value, rc)) = i;
if rc > 0 {
assert!(rc == 1);
batch.put(&key.bytes(), &value).expect("Low-level database error. Some issue with your hard disk?");
inserts += 1;
} }
if rc < 0 { &None => {
assert!(rc == -1); // Gets removed when moving from 1 to 0 additional refs. Should never be here at 0 additional refs.
deletes += 1; //assert!(!Self::is_already_in(db, &h));
batch.delete(&h.bytes());
}
_ => panic!("Invalid value in counters: {:?}", n),
} }
} }
trace!("commit: Inserted {}, Deleted {} nodes", inserts, deletes);
inserts + deletes
}
/// Just commit the overlay into the backing DB.
fn commit_without_counters(&mut self) -> Result<u32, UtilError> {
let batch = DBTransaction::new();
let ret = Self::batch_overlay_insertions(&mut self.overlay, &batch);
try!(self.backing.write(batch));
Ok(ret as u32)
} }
/// Commit all recent insert operations and historical removals from the old era /// Commit all recent insert operations and historical removals from the old era
/// to the backing database. /// to the backing database.
fn commit_with_counters(&mut self, now: u64, id: &H256, end: Option<(u64, H256)>) -> Result<u32, UtilError> { pub fn commit(&mut self, now: u64, id: &H256, end: Option<(u64, H256)>) -> Result<u32, UtilError> {
// journal format: // journal format:
// [era, 0] => [ id, [insert_0, ...], [remove_0, ...] ] // [era, 0] => [ id, [insert_0, ...], [remove_0, ...] ]
// [era, 1] => [ id, [insert_0, ...], [remove_0, ...] ] // [era, 1] => [ id, [insert_0, ...], [remove_0, ...] ]
@ -159,62 +176,84 @@ impl JournalDB {
// TODO: store reclaim_period. // TODO: store reclaim_period.
// when we make a new commit, we journal the inserts and removes. // When we make a new commit, we make a journal of all blocks in the recent history and record
// for each end_era that we journaled that we are no passing by, // all keys that were inserted and deleted. The journal is ordered by era; multiple commits can
// we remove all of its removes assuming it is canonical and all // share the same era. This forms a data structure similar to a queue but whose items are tuples.
// of its inserts otherwise. // By the time comes to remove a tuple from the queue (i.e. then the era passes from recent history
// into ancient history) then only one commit from the tuple is considered canonical. This commit
// is kept in the main backing database, whereas any others from the same era are reverted.
// //
// We also keep reference counters for each key inserted in the journal to handle // It is possible that a key, properly available in the backing database be deleted and re-inserted
// the following cases where key K must not be deleted from the DB when processing removals : // in the recent history queue, yet have both operations in commits that are eventually non-canonical.
// Given H is the journal size in eras, 0 <= C <= H. // To avoid the original, and still required, key from being deleted, we maintain a reference count
// Key K is removed in era A(N) and re-inserted in canonical era B(N + C). // which includes an original key, if any.
// Key K is removed in era A(N) and re-inserted in non-canonical era B`(N + C). //
// Key K is added in non-canonical era A'(N) canonical B(N + C). // The semantics of the `counter` are:
// insert key k:
// counter already contains k: count += 1
// counter doesn't contain k:
// backing db contains k: count = 1
// backing db doesn't contain k: insert into backing db, count = 0
// delete key k:
// counter contains k (count is asserted to be non-zero):
// count > 1: counter -= 1
// count == 1: remove counter
// count == 0: remove key from backing db
// counter doesn't contain k: remove key from backing db
//
// Practically, this means that for each commit block turning from recent to ancient we do the
// following:
// is_canonical:
// inserts: Ignored (left alone in the backing database).
// deletes: Enacted; however, recent history queue is checked for ongoing references. This is
// reduced as a preference to deletion from the backing database.
// !is_canonical:
// inserts: Reverted; however, recent history queue is checked for ongoing references. This is
// reduced as a preference to deletion from the backing database.
// deletes: Ignored (they were never inserted).
// //
// The counter is encreased each time a key is inserted in the journal in the commit. The list of insertions
// is saved with the era record. When the era becomes end_era and goes out of journal the counter is decreased
// and the key is safe to delete.
// record new commit's details. // record new commit's details.
trace!("commit: #{} ({}), end era: {:?}", now, id, end); let batch = WriteBatch::new();
let mut counters = self.counters.as_ref().unwrap().write().unwrap(); let mut counters = self.counters.write().unwrap();
let batch = DBTransaction::new();
{ {
let mut index = 0usize; let mut index = 0usize;
let mut last; let mut last;
while { while try!(self.backing.get({
let record = try!(self.backing.get({ let mut r = RlpStream::new_list(2);
let mut r = RlpStream::new_list(3);
r.append(&now); r.append(&now);
r.append(&index); r.append(&index);
r.append(&&PADDING[..]);
last = r.drain(); last = r.drain();
&last &last
})); })).is_some() {
match record {
Some(r) => {
assert!(&Rlp::new(&r).val_at::<H256>(0) != id);
true
},
None => false,
}
} {
index += 1; index += 1;
} }
let drained = self.overlay.drain();
let removes: Vec<H256> = drained
.iter()
.filter_map(|(ref k, &(_, ref c))| if *c < 0 {Some(k.clone())} else {None}).cloned()
.collect();
let inserts: Vec<(H256, Bytes)> = drained
.into_iter()
.filter_map(|(k, (v, r))| if r > 0 { assert!(r == 1); Some((k, v)) } else { assert!(r >= -1); None })
.collect();
let mut r = RlpStream::new_list(3); let mut r = RlpStream::new_list(3);
let inserts: Vec<H256> = self.overlay.keys().iter().filter(|&(_, &c)| c > 0).map(|(key, _)| key.clone()).collect();
// Increase counter for each inserted key no matter if the block is canonical or not.
for i in &inserts {
*counters.entry(i.clone()).or_insert(0) += 1;
}
let removes: Vec<H256> = self.overlay.keys().iter().filter(|&(_, &c)| c < 0).map(|(key, _)| key.clone()).collect();
r.append(id); r.append(id);
r.append(&inserts);
// Process the new inserts.
// We use the inserts for three things. For each:
// - we place into the backing DB or increment the counter if already in;
// - we note in the backing db that it was already in;
// - we write the key into our journal for this block;
r.begin_list(inserts.len());
inserts.iter().foreach(|&(k, _)| {r.append(&k);});
r.append(&removes); r.append(&removes);
Self::insert_keys(&inserts, &self.backing, &mut counters, &batch);
try!(batch.put(&last, r.as_raw())); try!(batch.put(&last, r.as_raw()));
try!(batch.put(&LATEST_ERA_KEY, &encode(&now)));
} }
// apply old commits' details // apply old commits' details
@ -222,105 +261,66 @@ impl JournalDB {
let mut index = 0usize; let mut index = 0usize;
let mut last; let mut last;
let mut to_remove: Vec<H256> = Vec::new(); let mut to_remove: Vec<H256> = Vec::new();
let mut canon_inserts: Vec<H256> = Vec::new();
while let Some(rlp_data) = try!(self.backing.get({ while let Some(rlp_data) = try!(self.backing.get({
let mut r = RlpStream::new_list(3); let mut r = RlpStream::new_list(2);
r.append(&end_era); r.append(&end_era);
r.append(&index); r.append(&index);
r.append(&&PADDING[..]);
last = r.drain(); last = r.drain();
&last &last
})) { })) {
let rlp = Rlp::new(&rlp_data); let rlp = Rlp::new(&rlp_data);
let mut inserts: Vec<H256> = rlp.val_at(1); let inserts: Vec<H256> = rlp.val_at(1);
JournalDB::decrease_counters(&inserts, &mut counters); let deletes: Vec<H256> = rlp.val_at(2);
// Collect keys to be removed. These are removed keys for canonical block, inserted for non-canonical // Collect keys to be removed. These are removed keys for canonical block, inserted for non-canonical
if canon_id == rlp.val_at(0) { Self::kill_keys(if canon_id == rlp.val_at(0) {deletes} else {inserts}, &mut counters, &batch);
let mut canon_deletes: Vec<H256> = rlp.val_at(2);
trace!("Purging nodes deleted from canon: {:?}", canon_deletes);
to_remove.append(&mut canon_deletes);
canon_inserts = inserts;
}
else {
trace!("Purging nodes inserted in non-canon: {:?}", inserts);
to_remove.append(&mut inserts);
}
trace!("commit: Delete journal for time #{}.{}: {}, (canon was {}): {} entries", end_era, index, rlp.val_at::<H256>(0), canon_id, to_remove.len());
try!(batch.delete(&last)); try!(batch.delete(&last));
index += 1; index += 1;
} }
try!(batch.put(&LAST_ERA_KEY, &encode(&end_era)));
let canon_inserts = canon_inserts.drain(..).collect::<HashSet<_>>(); trace!("JournalDB: delete journal for time #{}.{}, (canon was {})", end_era, index, canon_id);
// Purge removed keys if they are not referenced and not re-inserted in the canon commit
let mut deletes = 0;
trace!("Purging filtered nodes: {:?}", to_remove.iter().filter(|h| !counters.contains_key(h) && !canon_inserts.contains(h)).collect::<Vec<_>>());
for h in to_remove.iter().filter(|h| !counters.contains_key(h) && !canon_inserts.contains(h)) {
try!(batch.delete(&h));
deletes += 1;
}
trace!("Total nodes purged: {}", deletes);
} }
// Commit overlay insertions
let ret = Self::batch_overlay_insertions(&mut self.overlay, &batch);
try!(self.backing.write(batch)); try!(self.backing.write(batch));
Ok(ret as u32) // trace!("JournalDB::commit() deleted {} nodes", deletes);
} Ok(0)
// Decrease counters for given keys. Deletes obsolete counters
fn decrease_counters(keys: &[H256], counters: &mut HashMap<H256, i32>) {
for i in keys.iter() {
let delete_counter = {
let cnt = counters.get_mut(i).expect("Missing key counter");
*cnt -= 1;
*cnt == 0
};
if delete_counter {
counters.remove(i);
}
}
} }
fn payload(&self, key: &H256) -> Option<Bytes> { fn payload(&self, key: &H256) -> Option<Bytes> {
self.backing.get(&key.bytes()).expect("Low-level database error. Some issue with your hard disk?").map(|v| v.to_vec()) self.backing.get(&key.bytes()).expect("Low-level database error. Some issue with your hard disk?").map(|v| v.to_vec())
} }
fn read_counters(db: &Database) -> HashMap<H256, i32> { fn read_counters(db: &DB) -> HashMap<H256, i32> {
let mut res = HashMap::new(); let mut counters = HashMap::new();
if let Some(val) = db.get(&LATEST_ERA_KEY).expect("Low-level database error.") { if let Some(val) = db.get(&LAST_ERA_KEY).expect("Low-level database error.") {
let mut era = decode::<u64>(&val); let mut era = decode::<u64>(&val) + 1;
loop { loop {
let mut index = 0usize; let mut index = 0usize;
while let Some(rlp_data) = db.get({ while let Some(rlp_data) = db.get({
let mut r = RlpStream::new_list(3); let mut r = RlpStream::new_list(2);
r.append(&era); r.append(&era);
r.append(&index); r.append(&index);
r.append(&&PADDING[..]);
&r.drain() &r.drain()
}).expect("Low-level database error.") { }).expect("Low-level database error.") {
let rlp = Rlp::new(&rlp_data); let rlp = Rlp::new(&rlp_data);
let to_add: Vec<H256> = rlp.val_at(1); let inserts: Vec<H256> = rlp.val_at(1);
for h in to_add { Self::replay_keys(&inserts, db, &mut counters);
*res.entry(h).or_insert(0) += 1;
}
index += 1; index += 1;
}; };
if index == 0 || era == 0 { if index == 0 {
break; break;
} }
era -= 1; era += 1;
} }
} }
trace!("Recovered {} counters", res.len()); trace!("Recovered {} counters", counters.len());
res counters
} }
} }
impl HashDB for JournalDB { impl HashDB for JournalDB {
fn keys(&self) -> HashMap<H256, i32> { fn keys(&self) -> HashMap<H256, i32> {
let mut ret: HashMap<H256, i32> = HashMap::new(); let mut ret: HashMap<H256, i32> = HashMap::new();
for (key, _) in self.backing.iter() { for (key, _) in self.backing.iterator(IteratorMode::Start) {
let h = H256::from_slice(key.deref()); let h = H256::from_slice(key.deref());
ret.insert(h, 1); ret.insert(h, 1);
} }
@ -368,6 +368,28 @@ mod tests {
use super::*; use super::*;
use hashdb::*; use hashdb::*;
#[test]
fn insert_same_in_fork() {
// history is 1
let mut jdb = JournalDB::new_temp();
let x = jdb.insert(b"X");
jdb.commit(1, &b"1".sha3(), None).unwrap();
jdb.commit(2, &b"2".sha3(), None).unwrap();
jdb.commit(3, &b"1002a".sha3(), Some((1, b"1".sha3()))).unwrap();
jdb.commit(4, &b"1003a".sha3(), Some((2, b"2".sha3()))).unwrap();
jdb.remove(&x);
jdb.commit(3, &b"1002b".sha3(), Some((1, b"1".sha3()))).unwrap();
let x = jdb.insert(b"X");
jdb.commit(4, &b"1003b".sha3(), Some((2, b"2".sha3()))).unwrap();
jdb.commit(5, &b"1004a".sha3(), Some((3, b"1002a".sha3()))).unwrap();
jdb.commit(6, &b"1005a".sha3(), Some((4, b"1003a".sha3()))).unwrap();
assert!(jdb.exists(&x));
}
#[test] #[test]
fn long_history() { fn long_history() {
// history is 3 // history is 3
@ -487,31 +509,4 @@ mod tests {
jdb.commit(2, &b"2a".sha3(), Some((1, b"1a".sha3()))).unwrap(); jdb.commit(2, &b"2a".sha3(), Some((1, b"1a".sha3()))).unwrap();
assert!(jdb.exists(&foo)); assert!(jdb.exists(&foo));
} }
#[test]
fn reopen() {
let mut dir = ::std::env::temp_dir();
dir.push(H32::random().hex());
let foo = {
let mut jdb = JournalDB::new(dir.to_str().unwrap());
// history is 1
let foo = jdb.insert(b"foo");
jdb.commit(0, &b"0".sha3(), None).unwrap();
foo
};
{
let mut jdb = JournalDB::new(dir.to_str().unwrap());
jdb.remove(&foo);
jdb.commit(1, &b"1".sha3(), Some((0, b"0".sha3()))).unwrap();
}
{
let mut jdb = JournalDB::new(dir.to_str().unwrap());
assert!(jdb.exists(&foo));
jdb.commit(2, &b"2".sha3(), Some((1, b"1".sha3()))).unwrap();
assert!(!jdb.exists(&foo));
}
}
} }