From b477ca17fedca779a8b5895eef652096548474f2 Mon Sep 17 00:00:00 2001 From: NikVolf Date: Thu, 29 Sep 2016 13:19:39 +0300 Subject: [PATCH 1/4] bloom filter crate --- Cargo.lock | 5 + util/Cargo.toml | 1 + util/bloom/Cargo.toml | 9 ++ util/bloom/src/lib.rs | 240 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 util/bloom/Cargo.toml create mode 100644 util/bloom/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 840684b2d..15cb17c88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -309,6 +309,10 @@ dependencies = [ "rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ethcore-bloom-journal" +version = "0.1.0" + [[package]] name = "ethcore-dapps" version = "1.4.0" @@ -528,6 +532,7 @@ dependencies = [ "env_logger 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "eth-secp256k1 0.5.4 (git+https://github.com/ethcore/rust-secp256k1)", "ethcore-bigint 0.1.0", + "ethcore-bloom-journal 0.1.0", "ethcore-devtools 1.4.0", "heapsize 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/util/Cargo.toml b/util/Cargo.toml index 81916555c..520a4e003 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -34,6 +34,7 @@ using_queue = { path = "using_queue" } table = { path = "table" } ansi_term = "0.7" tiny-keccak= "1.0" +ethcore-bloom-journal = { path = "bloom" } [features] default = [] diff --git a/util/bloom/Cargo.toml b/util/bloom/Cargo.toml new file mode 100644 index 000000000..5397c691b --- /dev/null +++ b/util/bloom/Cargo.toml @@ -0,0 +1,9 @@ +[project] +name = "ethcore-bloom-journal" +version = "0.1.0" +authors = ["Ethcore"] +description = "Journaling bloom filter" +license = "GPL3" + +[lib] +path = "src/lib.rs" diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs new file mode 100644 index 000000000..b2b926166 --- /dev/null +++ b/util/bloom/src/lib.rs @@ -0,0 +1,240 @@ +// Copyright 2015, 2016 Ethcore (UK) Ltd. +// This file is part of Parity. + +// Parity is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Parity is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Parity. If not, see . + +use std::cmp; +use std::f64; +use std::hash::{Hash, Hasher, SipHasher}; +use std::collections::HashSet; + +/// BitVec structure with journalling +/// Every time any of the blocks is getting set it's index is tracked +/// and can be then drained by `drain` method +struct BitVecJournal { + elems: Vec, + journal: HashSet, +} + +impl BitVecJournal { + pub fn new(size: usize) -> BitVecJournal { + let extra = if size % 8 > 0 { 1 } else { 0 }; + BitVecJournal { + elems: vec![0u64; size / 8 + extra], + journal: HashSet::new(), + } + } + + pub fn from_parts(parts: &[u64]) -> BitVecJournal { + BitVecJournal { + elems: parts.to_vec(), + journal: HashSet::new(), + } + } + + pub fn set(&mut self, index: usize) { + let e_index = index / 64; + let bit_index = index % 64; + let val = self.elems.get_mut(e_index).unwrap(); + *val |= 1u64 << bit_index; + self.journal.insert(e_index); + } + + pub fn get(&self, index: usize) -> bool { + let e_index = index / 64; + let bit_index = index % 64; + self.elems[e_index] & (1 << bit_index) != 0 + } + + pub fn drain(&mut self) -> Vec<(usize, u64)> { + let journal = self.journal.drain().collect::>(); + journal.iter().map(|idx| (*idx, self.elems[*idx])).collect::>() + } + + pub fn how_full(&self) -> f64 { + self.elems.iter().fold(0u64, |acc, e| acc + e.count_ones() as u64) as f64 / (self.elems.len() * 64) as f64 + } +} + +/// Bloom filter structure +pub struct Bloom { + bitmap: BitVecJournal, + bitmap_bits: u64, + k_num: u32, + sips: [SipHasher; 2], +} + +impl Bloom { + /// Create a new bloom filter structure. + /// bitmap_size is the size in bytes (not bits) that will be allocated in memory + /// items_count is an estimation of the maximum number of items to store. + pub fn new(bitmap_size: usize, items_count: usize) -> Bloom { + assert!(bitmap_size > 0 && items_count > 0); + let bitmap_bits = (bitmap_size as u64) * 8u64; + let k_num = Bloom::optimal_k_num(bitmap_bits, items_count); + let bitmap = BitVecJournal::new(bitmap_bits as usize); + let sips = [Bloom::sip_new(), Bloom::sip_new()]; + Bloom { + bitmap: bitmap, + bitmap_bits: bitmap_bits, + k_num: k_num, + sips: sips, + } + } + + /// Initializes bloom filter from saved state + pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom { + let bitmap_size = parts.len()*8; + let bitmap_bits = (bitmap_size as u64) * 8u64; + let bitmap = BitVecJournal::from_parts(parts); + let sips = [Bloom::sip_new(), Bloom::sip_new()]; + Bloom { + bitmap: bitmap, + bitmap_bits: bitmap_bits, + k_num: k_num, + sips: sips, + } + } + + /// Create a new bloom filter structure. + /// items_count is an estimation of the maximum number of items to store. + /// fp_p is the wanted rate of false positives, in ]0.0, 1.0[ + pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom { + let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p); + Bloom::new(bitmap_size, items_count) + } + + /// Compute a recommended bitmap size for items_count items + /// and a fp_p rate of false positives. + /// fp_p obviously has to be within the ]0.0, 1.0[ range. + pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize { + assert!(items_count > 0); + assert!(fp_p > 0.0 && fp_p < 1.0); + let log2 = f64::consts::LN_2; + let log2_2 = log2 * log2; + ((items_count as f64) * f64::ln(fp_p) / (-8.0 * log2_2)).ceil() as usize + } + + /// Records the presence of an item. + pub fn set(&mut self, item: T) + where T: Hash + { + let mut hashes = [0u64, 0u64]; + for k_i in 0..self.k_num { + let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize; + self.bitmap.set(bit_offset); + } + } + + /// Check if an item is present in the set. + /// There can be false positives, but no false negatives. + pub fn check(&self, item: T) -> bool + where T: Hash + { + let mut hashes = [0u64, 0u64]; + for k_i in 0..self.k_num { + let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize; + if !self.bitmap.get(bit_offset) { + return false; + } + } + true + } + + /// Return the number of bits in the filter + pub fn number_of_bits(&self) -> u64 { + self.bitmap_bits + } + + /// Return the number of hash functions used for `check` and `set` + pub fn number_of_hash_functions(&self) -> u32 { + self.k_num + } + + fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 { + let m = bitmap_bits as f64; + let n = items_count as f64; + let k_num = (m / n * f64::ln(2.0f64)).ceil() as u32; + cmp::max(k_num, 1) + } + + fn bloom_hash(&self, hashes: &mut [u64; 2], item: &T, k_i: u32) -> u64 + where T: Hash + { + if k_i < 2 { + let sip = &mut self.sips[k_i as usize].clone(); + item.hash(sip); + let hash = sip.finish(); + hashes[k_i as usize] = hash; + hash + } else { + hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5) + } + } + + fn sip_new() -> SipHasher { + SipHasher::new() + } + + /// Drains the bloom journal returning the updated bloom part + pub fn drain_journal(&mut self) -> BloomJournal { + BloomJournal { + entries: self.bitmap.drain(), + hash_functions: self.k_num, + } + } + + /// Returns the ratio of set bits in the bloom filter to the total bits + pub fn how_full(&self) -> f64 { + self.bitmap.how_full() + } +} + +/// Bloom journal +/// Returns the tuple of (bloom part index, bloom part value) where each one is representing +/// an index of bloom parts that was updated since the last drain +pub struct BloomJournal { + pub hash_functions: u32, + pub entries: Vec<(usize, u64)>, +} + +#[test] +fn bloom_test_set() { + let mut bloom = Bloom::new(10, 80); + let key = vec![115u8, 99]; + assert!(!bloom.check(&key)); + bloom.set(&key); + assert!(bloom.check(&key)); +} + +#[test] +fn bloom_journalling() { + let initial = vec![0u64; 8]; + let mut bloom = Bloom::from_parts(&initial, 3); + bloom.set(&vec![5u8, 4]); + let drain = bloom.drain_journal(); + + assert_eq!(2, drain.entries.len()) +} + +#[test] +fn bloom_howfull() { + let initial = vec![0u64; 8]; + let mut bloom = Bloom::from_parts(&initial, 3); + bloom.set(&vec![5u8, 4]); + + let full = bloom.how_full(); + // 2/8/64 = 0.00390625 + assert!(full >= 0.0039f64 && full <= 0.004f64); +} From 59c0551ff468ed36e9c6e9dc450ff811978bd28c Mon Sep 17 00:00:00 2001 From: NikVolf Date: Thu, 29 Sep 2016 13:39:13 +0300 Subject: [PATCH 2/4] separate mod for tests --- util/bloom/src/lib.rs | 62 ++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index b2b926166..9cf0e89f1 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -209,32 +209,38 @@ pub struct BloomJournal { pub entries: Vec<(usize, u64)>, } -#[test] -fn bloom_test_set() { - let mut bloom = Bloom::new(10, 80); - let key = vec![115u8, 99]; - assert!(!bloom.check(&key)); - bloom.set(&key); - assert!(bloom.check(&key)); -} - -#[test] -fn bloom_journalling() { - let initial = vec![0u64; 8]; - let mut bloom = Bloom::from_parts(&initial, 3); - bloom.set(&vec![5u8, 4]); - let drain = bloom.drain_journal(); - - assert_eq!(2, drain.entries.len()) -} - -#[test] -fn bloom_howfull() { - let initial = vec![0u64; 8]; - let mut bloom = Bloom::from_parts(&initial, 3); - bloom.set(&vec![5u8, 4]); - - let full = bloom.how_full(); - // 2/8/64 = 0.00390625 - assert!(full >= 0.0039f64 && full <= 0.004f64); + +#[cfg(test)] +mod tests { + use super::Bloom; + + #[test] + fn bloom_test_set() { + let mut bloom = Bloom::new(10, 80); + let key = vec![115u8, 99]; + assert!(!bloom.check(&key)); + bloom.set(&key); + assert!(bloom.check(&key)); + } + + #[test] + fn bloom_journalling() { + let initial = vec![0u64; 8]; + let mut bloom = Bloom::from_parts(&initial, 3); + bloom.set(&vec![5u8, 4]); + let drain = bloom.drain_journal(); + + assert_eq!(2, drain.entries.len()) + } + + #[test] + fn bloom_howfull() { + let initial = vec![0u64; 8]; + let mut bloom = Bloom::from_parts(&initial, 3); + bloom.set(&vec![5u8, 4]); + + let full = bloom.how_full(); + // 2/8/64 = 0.00390625 + assert!(full >= 0.0039f64 && full <= 0.004f64); + } } From fa050246afbef8e259e06218615d594be7f9fdbf Mon Sep 17 00:00:00 2001 From: NikVolf Date: Fri, 30 Sep 2016 20:43:57 +0300 Subject: [PATCH 3/4] removed redundant memcopy --- util/bloom/src/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 9cf0e89f1..9d637965a 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -15,6 +15,7 @@ // along with Parity. If not, see . use std::cmp; +use std::mem; use std::f64; use std::hash::{Hash, Hasher, SipHasher}; use std::collections::HashSet; @@ -58,8 +59,8 @@ impl BitVecJournal { } pub fn drain(&mut self) -> Vec<(usize, u64)> { - let journal = self.journal.drain().collect::>(); - journal.iter().map(|idx| (*idx, self.elems[*idx])).collect::>() + let journal = mem::replace(&mut self.journal, HashSet::new()).into_iter(); + journal.map(|idx| (idx, self.elems[idx])).collect::>() } pub fn how_full(&self) -> f64 { From 18630496d5a24a70843a19ef0898f30cdbc79bea Mon Sep 17 00:00:00 2001 From: NikVolf Date: Fri, 30 Sep 2016 21:02:16 +0300 Subject: [PATCH 4/4] asterisk space --- util/bloom/src/lib.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 9d637965a..582437651 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -63,7 +63,7 @@ impl BitVecJournal { journal.map(|idx| (idx, self.elems[idx])).collect::>() } - pub fn how_full(&self) -> f64 { + pub fn saturation(&self) -> f64 { self.elems.iter().fold(0u64, |acc, e| acc + e.count_ones() as u64) as f64 / (self.elems.len() * 64) as f64 } } @@ -96,7 +96,7 @@ impl Bloom { /// Initializes bloom filter from saved state pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom { - let bitmap_size = parts.len()*8; + let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; let bitmap = BitVecJournal::from_parts(parts); let sips = [Bloom::sip_new(), Bloom::sip_new()]; @@ -197,8 +197,8 @@ impl Bloom { } /// Returns the ratio of set bits in the bloom filter to the total bits - pub fn how_full(&self) -> f64 { - self.bitmap.how_full() + pub fn saturation(&self) -> f64 { + self.bitmap.saturation() } } @@ -216,7 +216,7 @@ mod tests { use super::Bloom; #[test] - fn bloom_test_set() { + fn get_set() { let mut bloom = Bloom::new(10, 80); let key = vec![115u8, 99]; assert!(!bloom.check(&key)); @@ -225,7 +225,7 @@ mod tests { } #[test] - fn bloom_journalling() { + fn journalling() { let initial = vec![0u64; 8]; let mut bloom = Bloom::from_parts(&initial, 3); bloom.set(&vec![5u8, 4]); @@ -235,12 +235,12 @@ mod tests { } #[test] - fn bloom_howfull() { + fn saturation() { let initial = vec![0u64; 8]; let mut bloom = Bloom::from_parts(&initial, 3); bloom.set(&vec![5u8, 4]); - let full = bloom.how_full(); + let full = bloom.saturation(); // 2/8/64 = 0.00390625 assert!(full >= 0.0039f64 && full <= 0.004f64); }