Merge pull request #2395 from ethcore/bloom-crate
Journaling bloom filter crate in util
This commit is contained in:
		
						commit
						1d3e242d37
					
				
							
								
								
									
										5
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										5
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							@ -309,6 +309,10 @@ dependencies = [
 | 
			
		||||
 "rustc_version 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "ethcore-bloom-journal"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "ethcore-dapps"
 | 
			
		||||
version = "1.4.0"
 | 
			
		||||
@ -528,6 +532,7 @@ dependencies = [
 | 
			
		||||
 "env_logger 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
 | 
			
		||||
 "eth-secp256k1 0.5.4 (git+https://github.com/ethcore/rust-secp256k1)",
 | 
			
		||||
 "ethcore-bigint 0.1.0",
 | 
			
		||||
 "ethcore-bloom-journal 0.1.0",
 | 
			
		||||
 "ethcore-devtools 1.4.0",
 | 
			
		||||
 "heapsize 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
 | 
			
		||||
 "itertools 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
 | 
			
		||||
 | 
			
		||||
@ -34,6 +34,7 @@ using_queue = { path = "using_queue" }
 | 
			
		||||
table = { path = "table" }
 | 
			
		||||
ansi_term = "0.7"
 | 
			
		||||
tiny-keccak= "1.0"
 | 
			
		||||
ethcore-bloom-journal = { path = "bloom" }
 | 
			
		||||
 | 
			
		||||
[features]
 | 
			
		||||
default = []
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										9
									
								
								util/bloom/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								util/bloom/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,9 @@
 | 
			
		||||
[project]
 | 
			
		||||
name = "ethcore-bloom-journal"
 | 
			
		||||
version = "0.1.0"
 | 
			
		||||
authors = ["Ethcore <admin@ethcore.io>"]
 | 
			
		||||
description = "Journaling bloom filter"
 | 
			
		||||
license = "GPL-3.0"
 | 
			
		||||
 | 
			
		||||
[lib]
 | 
			
		||||
path = "src/lib.rs"
 | 
			
		||||
							
								
								
									
										247
									
								
								util/bloom/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										247
									
								
								util/bloom/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,247 @@
 | 
			
		||||
// Copyright 2015, 2016 Ethcore (UK) Ltd.
 | 
			
		||||
// This file is part of Parity.
 | 
			
		||||
 | 
			
		||||
// Parity is free software: you can redistribute it and/or modify
 | 
			
		||||
// it under the terms of the GNU General Public License as published by
 | 
			
		||||
// the Free Software Foundation, either version 3 of the License, or
 | 
			
		||||
// (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
// Parity is distributed in the hope that it will be useful,
 | 
			
		||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
// GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
// You should have received a copy of the GNU General Public License
 | 
			
		||||
// along with Parity.  If not, see <http://www.gnu.org/licenses/>.
 | 
			
		||||
 | 
			
		||||
use std::cmp;
 | 
			
		||||
use std::mem;
 | 
			
		||||
use std::f64;
 | 
			
		||||
use std::hash::{Hash, Hasher, SipHasher};
 | 
			
		||||
use std::collections::HashSet;
 | 
			
		||||
 | 
			
		||||
/// Bit vector stored as 64-bit words that journals every word it modifies.
///
/// Each `set` records the index of the `u64` word it touched. The recorded
/// indices, paired with the words' current values, are returned and cleared
/// by `drain`.
struct BitVecJournal {
	/// Backing storage: bit `i` lives in word `i / 64` at position `i % 64`.
	elems: Vec<u64>,
	/// Indices into `elems` of the words written since the last `drain`.
	journal: HashSet<usize>,
}

impl BitVecJournal {
	/// Creates a zeroed bit vector able to hold at least `size` bits.
	///
	/// `size` is a number of bits; storage is rounded up to whole 64-bit words.
	pub fn new(size: usize) -> BitVecJournal {
		// Every element holds 64 bits, so the word count is ceil(size / 64).
		// Dividing by 8 here would allocate eight times more words than are
		// addressable and would skew `saturation`, which counts 64 bits per word.
		let partial_word = if size % 64 > 0 { 1 } else { 0 };
		BitVecJournal {
			elems: vec![0u64; size / 64 + partial_word],
			journal: HashSet::new(),
		}
	}

	/// Rebuilds a bit vector from previously saved words, with an empty journal.
	pub fn from_parts(parts: &[u64]) -> BitVecJournal {
		BitVecJournal {
			elems: parts.to_vec(),
			journal: HashSet::new(),
		}
	}

	/// Sets bit `index` and journals the word that contains it.
	///
	/// # Panics
	///
	/// Panics if `index` lies beyond the allocated storage.
	pub fn set(&mut self, index: usize) {
		let word = index / 64;
		let bit = index % 64;
		self.elems[word] |= 1u64 << bit;
		self.journal.insert(word);
	}

	/// Returns whether bit `index` is set.
	///
	/// # Panics
	///
	/// Panics if `index` lies beyond the allocated storage.
	pub fn get(&self, index: usize) -> bool {
		let word = index / 64;
		let bit = index % 64;
		self.elems[word] & (1u64 << bit) != 0
	}

	/// Empties the journal and returns `(word index, current word value)` for
	/// every word modified since the previous drain, in unspecified order.
	pub fn drain(&mut self) -> Vec<(usize, u64)> {
		let touched = mem::replace(&mut self.journal, HashSet::new());
		touched.into_iter().map(|idx| (idx, self.elems[idx])).collect()
	}

	/// Returns the fraction of set bits over the total stored bits
	/// (`elems.len() * 64`), or `0.0` when there is no storage at all.
	pub fn saturation(&self) -> f64 {
		if self.elems.is_empty() {
			// Avoid 0 / 0 producing NaN for an empty vector.
			return 0.0;
		}
		let ones = self.elems.iter().fold(0u64, |acc, word| acc + word.count_ones() as u64);
		ones as f64 / (self.elems.len() * 64) as f64
	}
}
 | 
			
		||||
 | 
			
		||||
/// Bloom filter structure
 | 
			
		||||
pub struct Bloom {
 | 
			
		||||
	bitmap: BitVecJournal,
 | 
			
		||||
	bitmap_bits: u64,
 | 
			
		||||
	k_num: u32,
 | 
			
		||||
	sips: [SipHasher; 2],
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl Bloom {
 | 
			
		||||
	/// Create a new bloom filter structure.
 | 
			
		||||
	/// bitmap_size is the size in bytes (not bits) that will be allocated in memory
 | 
			
		||||
	/// items_count is an estimation of the maximum number of items to store.
 | 
			
		||||
	pub fn new(bitmap_size: usize, items_count: usize) -> Bloom {
 | 
			
		||||
		assert!(bitmap_size > 0 && items_count > 0);
 | 
			
		||||
		let bitmap_bits = (bitmap_size as u64) * 8u64;
 | 
			
		||||
		let k_num = Bloom::optimal_k_num(bitmap_bits, items_count);
 | 
			
		||||
		let bitmap = BitVecJournal::new(bitmap_bits as usize);
 | 
			
		||||
		let sips = [Bloom::sip_new(), Bloom::sip_new()];
 | 
			
		||||
		Bloom {
 | 
			
		||||
			bitmap: bitmap,
 | 
			
		||||
			bitmap_bits: bitmap_bits,
 | 
			
		||||
			k_num: k_num,
 | 
			
		||||
			sips: sips,
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Initializes bloom filter from saved state
 | 
			
		||||
	pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom {
 | 
			
		||||
		let bitmap_size = parts.len() * 8;
 | 
			
		||||
		let bitmap_bits = (bitmap_size as u64) * 8u64;
 | 
			
		||||
		let bitmap = BitVecJournal::from_parts(parts);
 | 
			
		||||
		let sips = [Bloom::sip_new(), Bloom::sip_new()];
 | 
			
		||||
		Bloom {
 | 
			
		||||
			bitmap: bitmap,
 | 
			
		||||
			bitmap_bits: bitmap_bits,
 | 
			
		||||
			k_num: k_num,
 | 
			
		||||
			sips: sips,
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Create a new bloom filter structure.
 | 
			
		||||
	/// items_count is an estimation of the maximum number of items to store.
 | 
			
		||||
	/// fp_p is the wanted rate of false positives, in ]0.0, 1.0[
 | 
			
		||||
	pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom {
 | 
			
		||||
		let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p);
 | 
			
		||||
		Bloom::new(bitmap_size, items_count)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Compute a recommended bitmap size for items_count items
 | 
			
		||||
	/// and a fp_p rate of false positives.
 | 
			
		||||
	/// fp_p obviously has to be within the ]0.0, 1.0[ range.
 | 
			
		||||
	pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize {
 | 
			
		||||
		assert!(items_count > 0);
 | 
			
		||||
		assert!(fp_p > 0.0 && fp_p < 1.0);
 | 
			
		||||
		let log2 = f64::consts::LN_2;
 | 
			
		||||
		let log2_2 = log2 * log2;
 | 
			
		||||
		((items_count as f64) * f64::ln(fp_p) / (-8.0 * log2_2)).ceil() as usize
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Records the presence of an item.
 | 
			
		||||
	pub fn set<T>(&mut self, item: T)
 | 
			
		||||
		where T: Hash
 | 
			
		||||
	{
 | 
			
		||||
		let mut hashes = [0u64, 0u64];
 | 
			
		||||
		for k_i in 0..self.k_num {
 | 
			
		||||
			let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;
 | 
			
		||||
			self.bitmap.set(bit_offset);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Check if an item is present in the set.
 | 
			
		||||
	/// There can be false positives, but no false negatives.
 | 
			
		||||
	pub fn check<T>(&self, item: T) -> bool
 | 
			
		||||
		where T: Hash
 | 
			
		||||
	{
 | 
			
		||||
		let mut hashes = [0u64, 0u64];
 | 
			
		||||
		for k_i in 0..self.k_num {
 | 
			
		||||
			let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;
 | 
			
		||||
			if !self.bitmap.get(bit_offset) {
 | 
			
		||||
				return false;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		true
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Return the number of bits in the filter
 | 
			
		||||
	pub fn number_of_bits(&self) -> u64 {
 | 
			
		||||
		self.bitmap_bits
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Return the number of hash functions used for `check` and `set`
 | 
			
		||||
	pub fn number_of_hash_functions(&self) -> u32 {
 | 
			
		||||
		self.k_num
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 {
 | 
			
		||||
		let m = bitmap_bits as f64;
 | 
			
		||||
		let n = items_count as f64;
 | 
			
		||||
		let k_num = (m / n * f64::ln(2.0f64)).ceil() as u32;
 | 
			
		||||
		cmp::max(k_num, 1)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fn bloom_hash<T>(&self, hashes: &mut [u64; 2], item: &T, k_i: u32) -> u64
 | 
			
		||||
		where T: Hash
 | 
			
		||||
	{
 | 
			
		||||
		if k_i < 2 {
 | 
			
		||||
			let sip = &mut self.sips[k_i as usize].clone();
 | 
			
		||||
			item.hash(sip);
 | 
			
		||||
			let hash = sip.finish();
 | 
			
		||||
			hashes[k_i as usize] = hash;
 | 
			
		||||
			hash
 | 
			
		||||
		} else {
 | 
			
		||||
			hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	fn sip_new() -> SipHasher {
 | 
			
		||||
		SipHasher::new()
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Drains the bloom journal returning the updated bloom part
 | 
			
		||||
	pub fn drain_journal(&mut self) -> BloomJournal {
 | 
			
		||||
		BloomJournal {
 | 
			
		||||
			entries: self.bitmap.drain(),
 | 
			
		||||
			hash_functions: self.k_num,
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/// Returns the ratio of set bits in the bloom filter to the total bits
 | 
			
		||||
	pub fn saturation(&self) -> f64 {
 | 
			
		||||
		self.bitmap.saturation()
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Bloom journal: the changes made to a bloom filter since its last drain.
///
/// `entries` holds `(part index, part value)` pairs, one per 64-bit bloom
/// part that was updated since the previous drain; `hash_functions` is the
/// number of hash functions the filter uses.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomJournal {
	/// Number of hash functions used by the filter that produced this journal.
	pub hash_functions: u32,
	/// Updated parts as `(index of the 64-bit part, current value of that part)`.
	pub entries: Vec<(usize, u64)>,
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#[cfg(test)]
mod tests {
	use super::Bloom;

	/// An item is reported absent before `set` and present afterwards.
	#[test]
	fn get_set() {
		let mut filter = Bloom::new(10, 80);
		let item = vec![115u8, 99];

		assert!(!filter.check(&item));
		filter.set(&item);
		assert!(filter.check(&item));
	}

	/// Setting one item on an empty 8-part filter journals exactly two parts.
	#[test]
	fn journalling() {
		let parts = vec![0u64; 8];
		let mut filter = Bloom::from_parts(&parts, 3);
		filter.set(&vec![5u8, 4]);

		let journal = filter.drain_journal();
		assert_eq!(2, journal.entries.len())
	}

	/// Saturation reflects the bits set by a single item.
	#[test]
	fn saturation() {
		let parts = vec![0u64; 8];
		let mut filter = Bloom::from_parts(&parts, 3);
		filter.set(&vec![5u8, 4]);

		// Two bits set out of 8 parts of 64 bits: 2/8/64 = 0.00390625
		let ratio = filter.saturation();
		assert!(0.0039f64 <= ratio && ratio <= 0.004f64);
	}
}
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user