openethereum/util/bloom/src/lib.rs

// Copyright 2015, 2016 Ethcore (UK) Ltd.
// This file is part of Parity.

// Parity is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Parity is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Parity.  If not, see <http://www.gnu.org/licenses/>.


extern crate siphasher;

use std::cmp;
use std::mem;
use std::f64;
use std::hash::{Hash, Hasher};
use std::collections::HashSet;
use siphasher::sip::SipHasher;

// TODO [ToDr] Both hashers are exactly the same - no point to keep two.
const NUMBER_OF_HASHERS: usize = 2;

/// Bit vector with journalling.
///
/// Bits are packed into 64-bit words. Every time a bit is set, the index of the
/// word that holds it is recorded; the recorded words can then be collected
/// (and the record cleared) with the `drain` method.
struct BitVecJournal {
	/// Backing storage: bit `i` lives in word `i / 64` at bit position `i % 64`.
	elems: Vec<u64>,
	/// Indices into `elems` of the words touched by `set` since the last `drain`.
	journal: HashSet<usize>,
}

impl BitVecJournal {
	/// Creates a zeroed bit vector able to hold `size` bits.
	///
	/// The storage is rounded up to a whole number of 64-bit words.
	pub fn new(size: usize) -> BitVecJournal {
		// A `u64` word stores 64 bits (not 8), so divide by the word width and
		// add one trailing word when `size` is not a multiple of it.
		let extra = if size % 64 > 0 { 1 } else { 0 };
		BitVecJournal {
			elems: vec![0u64; size / 64 + extra],
			journal: HashSet::new(),
		}
	}

	/// Builds a bit vector from previously saved words, starting with an empty journal.
	pub fn from_parts(parts: &[u64]) -> BitVecJournal {
		BitVecJournal {
			elems: parts.to_vec(),
			journal: HashSet::new(),
		}
	}

	/// Sets the bit at `index` and records the containing word in the journal.
	///
	/// # Panics
	///
	/// Panics if `index` lies beyond the allocated words.
	pub fn set(&mut self, index: usize) {
		let e_index = index / 64;
		let bit_index = index % 64;
		self.elems[e_index] |= 1u64 << bit_index;
		self.journal.insert(e_index);
	}

	/// Returns whether the bit at `index` is set.
	///
	/// # Panics
	///
	/// Panics if `index` lies beyond the allocated words.
	pub fn get(&self, index: usize) -> bool {
		let e_index = index / 64;
		let bit_index = index % 64;
		self.elems[e_index] & (1u64 << bit_index) != 0
	}

	/// Empties the journal and returns `(word index, current word value)` for
	/// every word touched since the previous drain, in no particular order.
	pub fn drain(&mut self) -> Vec<(usize, u64)> {
		// Swap the journal out instead of cloning it so the set is moved, not copied.
		let touched = mem::replace(&mut self.journal, HashSet::new());
		touched.into_iter().map(|idx| (idx, self.elems[idx])).collect()
	}

	/// Returns the ratio of set bits to the total number of allocated bits.
	///
	/// An empty bit vector has no bits to saturate and reports `0.0`.
	pub fn saturation(&self) -> f64 {
		if self.elems.is_empty() {
			// Avoid the 0/0 = NaN result for a vector without storage.
			return 0.0;
		}
		let set_bits = self.elems.iter().fold(0u64, |acc, e| acc + e.count_ones() as u64);
		set_bits as f64 / (self.elems.len() * 64) as f64
	}
}

/// Bloom filter structure
///
/// Items are recorded with `set` and looked up with `check`; each operation
/// probes `k_num` bit positions of `bitmap`, every position being a hash value
/// reduced modulo `bitmap_bits`.
pub struct Bloom {
	// Journalled bit storage backing the filter.
	bitmap: BitVecJournal,
	// Number of addressable bits; hash values are reduced modulo this value.
	bitmap_bits: u64,
	// Number of bit positions probed per item.
	k_num: u32,
	// TODO [ToDr] Both hashers are exactly the same - no point to keep two.
	sips: [SipHasher; NUMBER_OF_HASHERS],
}

impl Bloom {
	/// Build an empty bloom filter.
	///
	/// `bitmap_size` is the size of the filter in bytes (not bits): the filter
	/// addresses `bitmap_size * 8` bit positions.
	/// `items_count` is an estimate of the maximum number of items to store; it
	/// is used to pick the number of hash functions.
	///
	/// # Panics
	///
	/// Panics if `bitmap_size` or `items_count` is zero.
	pub fn new(bitmap_size: usize, items_count: usize) -> Bloom {
		assert!(bitmap_size > 0 && items_count > 0);
		let total_bits = (bitmap_size as u64) * 8u64;
		Bloom {
			k_num: Bloom::optimal_k_num(total_bits, items_count),
			bitmap: BitVecJournal::new(total_bits as usize),
			bitmap_bits: total_bits,
			sips: [SipHasher::new(), SipHasher::new()],
		}
	}

	/// Restore a bloom filter from saved state.
	///
	/// `parts` are the 64-bit words of the bitmap (so the filter addresses
	/// `parts.len() * 64` bits) and `k_num` is the number of hash functions the
	/// saved filter was using.
	pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom {
		// Each part is 8 bytes wide, and each byte contributes 8 bits.
		let byte_len = parts.len() * 8;
		let total_bits = (byte_len as u64) * 8u64;
		Bloom {
			bitmap: BitVecJournal::from_parts(parts),
			bitmap_bits: total_bits,
			k_num: k_num,
			sips: [SipHasher::new(), SipHasher::new()],
		}
	}

	/// Build an empty bloom filter sized for a target false-positive rate.
	///
	/// `items_count` is an estimate of the maximum number of items to store.
	/// `fp_p` is the wanted rate of false positives, in ]0.0, 1.0[.
	///
	/// # Panics
	///
	/// Panics if `items_count` is zero or `fp_p` is outside ]0.0, 1.0[.
	pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom {
		Bloom::new(Bloom::compute_bitmap_size(items_count, fp_p), items_count)
	}

	/// Recommend a bitmap size for storing `items_count` items with a
	/// false-positive rate of `fp_p`.
	///
	/// The result is `items_count * ln(fp_p) / (-8 * ln(2)^2)`, rounded up.
	///
	/// # Panics
	///
	/// Panics if `items_count` is zero or `fp_p` is not within ]0.0, 1.0[.
	pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize {
		assert!(items_count > 0);
		assert!(fp_p > 0.0 && fp_p < 1.0);
		let ln_2_squared = f64::consts::LN_2 * f64::consts::LN_2;
		let size = (items_count as f64) * fp_p.ln() / (-8.0 * ln_2_squared);
		size.ceil() as usize
	}

	/// Records the presence of an item.
	///
	/// Sets one bit per hash function; each bit position is the hash value
	/// reduced modulo the number of bits in the filter.
	pub fn set<T>(&mut self, item: T)
		where T: Hash
	{
		let total_bits = self.bitmap_bits;
		// Scratch space where `bloom_hash` keeps the base hashes between rounds.
		let mut seeds = [0u64; 2];
		for round in 0..self.k_num {
			let digest = self.bloom_hash(&mut seeds, &item, round);
			self.bitmap.set((digest % total_bits) as usize);
		}
	}

	/// Check if an item is present in the set.
	/// There can be false positives, but no false negatives.
	///
	/// Probing stops at the first bit position that is not set.
	pub fn check<T>(&self, item: T) -> bool
		where T: Hash
	{
		// Scratch space where `bloom_hash` keeps the base hashes between rounds.
		let mut seeds = [0u64; 2];
		(0..self.k_num).all(|round| {
			let digest = self.bloom_hash(&mut seeds, &item, round);
			self.bitmap.get((digest % self.bitmap_bits) as usize)
		})
	}

	/// Return the number of bits in the filter.
	///
	/// This is the modulus applied to every hash value by `set` and `check`.
	pub fn number_of_bits(&self) -> u64 {
		self.bitmap_bits
	}

	/// Return the number of hash functions used for `check` and `set`,
	/// i.e. how many bit positions are probed per item.
	pub fn number_of_hash_functions(&self) -> u32 {
		self.k_num
	}

	/// Pick the number of hash functions for a filter of `bitmap_bits` bits
	/// expected to hold `items_count` items: `ceil(bits / items * ln 2)`,
	/// but never fewer than one.
	fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 {
		let bits_per_item = bitmap_bits as f64 / items_count as f64;
		let ideal = (bits_per_item * 2.0f64.ln()).ceil() as u32;
		// A filter with zero hash functions could never record anything.
		cmp::max(1, ideal)
	}

	/// Produce the hash of `item` for round `k_i`.
	///
	/// For the first `NUMBER_OF_HASHERS` rounds the item is hashed with the
	/// corresponding SipHasher and the result is cached in `hashes[k_i]`.
	/// Later rounds are derived from the cached values as
	/// `hashes[0] + (k_i * hashes[1]) % 0xffffffffffffffc5` with wrapping
	/// arithmetic, so callers must run the rounds in increasing order using
	/// the same `hashes` buffer.
	fn bloom_hash<T>(&self, hashes: &mut [u64; NUMBER_OF_HASHERS], item: &T, k_i: u32) -> u64
		where T: Hash
	{
		if k_i >= NUMBER_OF_HASHERS as u32 {
			// Derived round: combine the two cached base hashes.
			let step = (k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5;
			return hashes[0].wrapping_add(step);
		}

		// Base round: hash with a fresh copy so the stored hasher state stays pristine.
		let slot = k_i as usize;
		let mut hasher = self.sips[slot].clone();
		item.hash(&mut hasher);
		let digest = hasher.finish();
		hashes[slot] = digest;
		digest
	}

	/// Drains the bloom journal, returning the bitmap parts updated since the
	/// previous drain together with the number of hash functions in use.
	pub fn drain_journal(&mut self) -> BloomJournal {
		let updated_parts = self.bitmap.drain();
		BloomJournal {
			hash_functions: self.k_num,
			entries: updated_parts,
		}
	}

	/// Returns the ratio of set bits in the bloom filter to the total bits,
	/// as reported by the underlying bitmap.
	pub fn saturation(&self) -> f64 {
		self.bitmap.saturation()
	}
}

/// Bloom journal
///
/// Snapshot produced by `Bloom::drain_journal`: the bloom parts that were
/// updated since the last drain, plus the number of hash functions the filter
/// was using.
pub struct BloomJournal {
    /// Number of hash functions used by the filter that produced this journal.
    pub hash_functions: u32,
    /// Tuples of (bloom part index, bloom part value), one for each part that
    /// was updated since the last drain.
    pub entries: Vec<(usize, u64)>,
}


#[cfg(test)]
mod tests {
	use super::Bloom;

	/// A key is reported absent before it is recorded and present afterwards.
	#[test]
	fn get_set() {
		let key = vec![115u8, 99];
		let mut filter = Bloom::new(10, 80);

		let seen_before = filter.check(&key);
		filter.set(&key);
		let seen_after = filter.check(&key);

		assert!(!seen_before);
		assert!(seen_after);
	}

	/// Recording one key in an empty 8-word filter journals the touched words.
	#[test]
	fn journalling() {
		let words = vec![0u64; 8];
		let mut filter = Bloom::from_parts(&words, 3);
		filter.set(&vec![5u8, 4]);

		let journal = filter.drain_journal();
		assert_eq!(2, journal.entries.len())
	}

	/// Saturation reflects the bits set by a single key in an 8-word filter.
	#[test]
	fn saturation() {
		let words = vec![0u64; 8];
		let mut filter = Bloom::from_parts(&words, 3);
		filter.set(&vec![5u8, 4]);

		// 2/8/64 = 0.00390625
		let ratio = filter.saturation();
		assert!(ratio >= 0.0039f64 && ratio <= 0.004f64);
	}
}
bloom filter crate 2016-09-29 12:19:39 +02:00			`// Copyright 2015, 2016 Ethcore (UK) Ltd.`
			`// This file is part of Parity.`

			`// Parity is free software: you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation, either version 3 of the License, or`
			`// (at your option) any later version.`

			`// Parity is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`

			`// You should have received a copy of the GNU General Public License`
			`// along with Parity. If not, see <http://www.gnu.org/licenses/>.`

Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00
			`extern crate siphasher;`

bloom filter crate 2016-09-29 12:19:39 +02:00			`use std::cmp;`
removed redundant memcopy 2016-09-30 19:43:57 +02:00			`use std::mem;`
bloom filter crate 2016-09-29 12:19:39 +02:00			`use std::f64;`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`use std::hash::{Hash, Hasher};`
bloom filter crate 2016-09-29 12:19:39 +02:00			`use std::collections::HashSet;`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`use siphasher::sip::SipHasher;`

			`// TODO [ToDr] Both hashers are exactly the same - no point to keep two.`
			`const NUMBER_OF_HASHERS: usize = 2;`
bloom filter crate 2016-09-29 12:19:39 +02:00
			`/// BitVec structure with journalling`
			`/// Every time any of the blocks is getting set it's index is tracked`
			/// and can be then drained by `drain` method
			`struct BitVecJournal {`
			`elems: Vec<u64>,`
			`journal: HashSet<usize>,`
			`}`

			`impl BitVecJournal {`
			`pub fn new(size: usize) -> BitVecJournal {`
			`let extra = if size % 8 > 0 { 1 } else { 0 };`
			`BitVecJournal {`
			`elems: vec![0u64; size / 8 + extra],`
			`journal: HashSet::new(),`
			`}`
			`}`

			`pub fn from_parts(parts: &[u64]) -> BitVecJournal {`
			`BitVecJournal {`
			`elems: parts.to_vec(),`
			`journal: HashSet::new(),`
			`}`
			`}`

			`pub fn set(&mut self, index: usize) {`
			`let e_index = index / 64;`
			`let bit_index = index % 64;`
			`let val = self.elems.get_mut(e_index).unwrap();`
			`*val \|= 1u64 << bit_index;`
			`self.journal.insert(e_index);`
			`}`

			`pub fn get(&self, index: usize) -> bool {`
			`let e_index = index / 64;`
			`let bit_index = index % 64;`
			`self.elems[e_index] & (1 << bit_index) != 0`
			`}`

			`pub fn drain(&mut self) -> Vec<(usize, u64)> {`
removed redundant memcopy 2016-09-30 19:43:57 +02:00			`let journal = mem::replace(&mut self.journal, HashSet::new()).into_iter();`
			`journal.map(\|idx\| (idx, self.elems[idx])).collect::<Vec<(usize, u64)>>()`
bloom filter crate 2016-09-29 12:19:39 +02:00			`}`

asterisk space 2016-09-30 20:02:16 +02:00			`pub fn saturation(&self) -> f64 {`
bloom filter crate 2016-09-29 12:19:39 +02:00			`self.elems.iter().fold(0u64, \|acc, e\| acc + e.count_ones() as u64) as f64 / (self.elems.len() * 64) as f64`
			`}`
			`}`

			`/// Bloom filter structure`
			`pub struct Bloom {`
			`bitmap: BitVecJournal,`
			`bitmap_bits: u64,`
			`k_num: u32,`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`// TODO [ToDr] Both hashers are exactly the same - no point to keep two.`
			`sips: [SipHasher; NUMBER_OF_HASHERS],`
bloom filter crate 2016-09-29 12:19:39 +02:00			`}`

			`impl Bloom {`
			`/// Create a new bloom filter structure.`
			`/// bitmap_size is the size in bytes (not bits) that will be allocated in memory`
			`/// items_count is an estimation of the maximum number of items to store.`
			`pub fn new(bitmap_size: usize, items_count: usize) -> Bloom {`
			`assert!(bitmap_size > 0 && items_count > 0);`
			`let bitmap_bits = (bitmap_size as u64) * 8u64;`
			`let k_num = Bloom::optimal_k_num(bitmap_bits, items_count);`
			`let bitmap = BitVecJournal::new(bitmap_bits as usize);`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`let sips = [SipHasher::new(), SipHasher::new()];`
bloom filter crate 2016-09-29 12:19:39 +02:00			`Bloom {`
			`bitmap: bitmap,`
			`bitmap_bits: bitmap_bits,`
			`k_num: k_num,`
			`sips: sips,`
			`}`
			`}`

			`/// Initializes bloom filter from saved state`
			`pub fn from_parts(parts: &[u64], k_num: u32) -> Bloom {`
asterisk space 2016-09-30 20:02:16 +02:00			`let bitmap_size = parts.len() * 8;`
bloom filter crate 2016-09-29 12:19:39 +02:00			`let bitmap_bits = (bitmap_size as u64) * 8u64;`
			`let bitmap = BitVecJournal::from_parts(parts);`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`let sips = [SipHasher::new(), SipHasher::new()];`
bloom filter crate 2016-09-29 12:19:39 +02:00			`Bloom {`
			`bitmap: bitmap,`
			`bitmap_bits: bitmap_bits,`
			`k_num: k_num,`
			`sips: sips,`
			`}`
			`}`

			`/// Create a new bloom filter structure.`
			`/// items_count is an estimation of the maximum number of items to store.`
			`/// fp_p is the wanted rate of false positives, in ]0.0, 1.0[`
			`pub fn new_for_fp_rate(items_count: usize, fp_p: f64) -> Bloom {`
			`let bitmap_size = Bloom::compute_bitmap_size(items_count, fp_p);`
			`Bloom::new(bitmap_size, items_count)`
			`}`

			`/// Compute a recommended bitmap size for items_count items`
			`/// and a fp_p rate of false positives.`
			`/// fp_p obviously has to be within the ]0.0, 1.0[ range.`
			`pub fn compute_bitmap_size(items_count: usize, fp_p: f64) -> usize {`
			`assert!(items_count > 0);`
			`assert!(fp_p > 0.0 && fp_p < 1.0);`
			`let log2 = f64::consts::LN_2;`
			`let log2_2 = log2 * log2;`
			`((items_count as f64) * f64::ln(fp_p) / (-8.0 * log2_2)).ceil() as usize`
			`}`

			`/// Records the presence of an item.`
			`pub fn set<T>(&mut self, item: T)`
			`where T: Hash`
			`{`
			`let mut hashes = [0u64, 0u64];`
			`for k_i in 0..self.k_num {`
			`let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;`
			`self.bitmap.set(bit_offset);`
			`}`
			`}`

			`/// Check if an item is present in the set.`
			`/// There can be false positives, but no false negatives.`
			`pub fn check<T>(&self, item: T) -> bool`
			`where T: Hash`
			`{`
			`let mut hashes = [0u64, 0u64];`
			`for k_i in 0..self.k_num {`
			`let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize;`
			`if !self.bitmap.get(bit_offset) {`
			`return false;`
			`}`
			`}`
			`true`
			`}`

			`/// Return the number of bits in the filter`
			`pub fn number_of_bits(&self) -> u64 {`
			`self.bitmap_bits`
			`}`

			/// Return the number of hash functions used for `check` and `set`
			`pub fn number_of_hash_functions(&self) -> u32 {`
			`self.k_num`
			`}`

			`fn optimal_k_num(bitmap_bits: u64, items_count: usize) -> u32 {`
			`let m = bitmap_bits as f64;`
			`let n = items_count as f64;`
			`let k_num = (m / n * f64::ln(2.0f64)).ceil() as u32;`
			`cmp::max(k_num, 1)`
			`}`

Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`fn bloom_hash<T>(&self, hashes: &mut [u64; NUMBER_OF_HASHERS], item: &T, k_i: u32) -> u64`
bloom filter crate 2016-09-29 12:19:39 +02:00			`where T: Hash`
			`{`
Using SipHasher from crates.io 2016-10-21 11:57:30 +02:00			`if k_i < NUMBER_OF_HASHERS as u32 {`
			`let mut sip = self.sips[k_i as usize].clone();`
Using DefaultHasher instead of SipHasher 2016-10-21 11:16:55 +02:00			`item.hash(&mut sip);`
bloom filter crate 2016-09-29 12:19:39 +02:00			`let hash = sip.finish();`
			`hashes[k_i as usize] = hash;`
			`hash`
			`} else {`
			`hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5)`
			`}`
			`}`

			`/// Drains the bloom journal returning the updated bloom part`
			`pub fn drain_journal(&mut self) -> BloomJournal {`
			`BloomJournal {`
			`entries: self.bitmap.drain(),`
			`hash_functions: self.k_num,`
			`}`
			`}`

			`/// Returns the ratio of set bits in the bloom filter to the total bits`
asterisk space 2016-09-30 20:02:16 +02:00			`pub fn saturation(&self) -> f64 {`
			`self.bitmap.saturation()`
bloom filter crate 2016-09-29 12:19:39 +02:00			`}`
			`}`

			`/// Bloom journal`
			`/// Returns the tuple of (bloom part index, bloom part value) where each one is representing`
			`/// an index of bloom parts that was updated since the last drain`
			`pub struct BloomJournal {`
			`pub hash_functions: u32,`
			`pub entries: Vec<(usize, u64)>,`
			`}`


separate mod for tests 2016-09-29 12:39:13 +02:00			`#[cfg(test)]`
			`mod tests {`
			`use super::Bloom;`
bloom filter crate 2016-09-29 12:19:39 +02:00
separate mod for tests 2016-09-29 12:39:13 +02:00			`#[test]`
asterisk space 2016-09-30 20:02:16 +02:00			`fn get_set() {`
separate mod for tests 2016-09-29 12:39:13 +02:00			`let mut bloom = Bloom::new(10, 80);`
			`let key = vec![115u8, 99];`
			`assert!(!bloom.check(&key));`
			`bloom.set(&key);`
			`assert!(bloom.check(&key));`
			`}`
bloom filter crate 2016-09-29 12:19:39 +02:00
separate mod for tests 2016-09-29 12:39:13 +02:00			`#[test]`
asterisk space 2016-09-30 20:02:16 +02:00			`fn journalling() {`
separate mod for tests 2016-09-29 12:39:13 +02:00			`let initial = vec![0u64; 8];`
			`let mut bloom = Bloom::from_parts(&initial, 3);`
			`bloom.set(&vec![5u8, 4]);`
			`let drain = bloom.drain_journal();`
bloom filter crate 2016-09-29 12:19:39 +02:00
separate mod for tests 2016-09-29 12:39:13 +02:00			`assert_eq!(2, drain.entries.len())`
			`}`

			`#[test]`
asterisk space 2016-09-30 20:02:16 +02:00			`fn saturation() {`
separate mod for tests 2016-09-29 12:39:13 +02:00			`let initial = vec![0u64; 8];`
			`let mut bloom = Bloom::from_parts(&initial, 3);`
			`bloom.set(&vec![5u8, 4]);`

asterisk space 2016-09-30 20:02:16 +02:00			`let full = bloom.saturation();`
separate mod for tests 2016-09-29 12:39:13 +02:00			`// 2/8/64 = 0.00390625`
			`assert!(full >= 0.0039f64 && full <= 0.004f64);`
			`}`
bloom filter crate 2016-09-29 12:19:39 +02:00			`}`