From e04d58f647c3b2bf0bbf7767fb0f79f78c899461 Mon Sep 17 00:00:00 2001 From: Hawstein Date: Thu, 31 Aug 2017 00:38:05 +0800 Subject: [PATCH] use one hasher in Bloom (#6404) * remove the redundant hasher in Bloom * add the test to check the hash backward compatibility --- util/bloom/src/lib.rs | 49 ++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/util/bloom/src/lib.rs b/util/bloom/src/lib.rs index 91897ed44..4fdb61d40 100644 --- a/util/bloom/src/lib.rs +++ b/util/bloom/src/lib.rs @@ -24,9 +24,6 @@ use std::hash::{Hash, Hasher}; use std::collections::HashSet; use siphasher::sip::SipHasher; -// TODO [ToDr] Both hashers are exactly the same - no point to keep two. -const NUMBER_OF_HASHERS: usize = 2; - /// BitVec structure with journalling /// Every time any of the blocks is getting set it's index is tracked /// and can be then drained by `drain` method @@ -80,8 +77,6 @@ pub struct Bloom { bitmap: BitVecJournal, bitmap_bits: u64, k_num: u32, - // TODO [ToDr] Both hashers are exactly the same - no point to keep two. - sips: [SipHasher; NUMBER_OF_HASHERS], } impl Bloom { @@ -93,12 +88,10 @@ impl Bloom { let bitmap_bits = (bitmap_size as u64) * 8u64; let k_num = Bloom::optimal_k_num(bitmap_bits, items_count); let bitmap = BitVecJournal::new(bitmap_bits as usize); - let sips = [SipHasher::new(), SipHasher::new()]; Bloom { bitmap: bitmap, bitmap_bits: bitmap_bits, k_num: k_num, - sips: sips, } } @@ -107,12 +100,10 @@ impl Bloom { let bitmap_size = parts.len() * 8; let bitmap_bits = (bitmap_size as u64) * 8u64; let bitmap = BitVecJournal::from_parts(parts); - let sips = [SipHasher::new(), SipHasher::new()]; Bloom { bitmap: bitmap, bitmap_bits: bitmap_bits, k_num: k_num, - sips: sips, } } @@ -139,9 +130,9 @@ impl Bloom { pub fn set(&mut self, item: T) where T: Hash { - let mut hashes = [0u64, 0u64]; + let base_hash = Bloom::sip_hash(&item); for k_i in 0..self.k_num { - let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize; + let bit_offset = (Bloom::bloom_hash(base_hash, k_i) % self.bitmap_bits) as usize; self.bitmap.set(bit_offset); } } @@ -151,9 +142,9 @@ impl Bloom { pub fn check(&self, item: T) -> bool where T: Hash { - let mut hashes = [0u64, 0u64]; + let base_hash = Bloom::sip_hash(&item); for k_i in 0..self.k_num { - let bit_offset = (self.bloom_hash(&mut hashes, &item, k_i) % self.bitmap_bits) as usize; + let bit_offset = (Bloom::bloom_hash(base_hash, k_i) % self.bitmap_bits) as usize; if !self.bitmap.get(bit_offset) { return false; } @@ -178,17 +169,20 @@ impl Bloom { cmp::max(k_num, 1) } - fn bloom_hash(&self, hashes: &mut [u64; NUMBER_OF_HASHERS], item: &T, k_i: u32) -> u64 + fn sip_hash(item: &T) -> u64 where T: Hash { - if k_i < NUMBER_OF_HASHERS as u32 { - let mut sip = self.sips[k_i as usize].clone(); - item.hash(&mut sip); - let hash = sip.finish(); - hashes[k_i as usize] = hash; - hash + let mut sip = SipHasher::new(); + item.hash(&mut sip); + let hash = sip.finish(); + hash + } + + fn bloom_hash(base_hash: u64, k_i: u32) -> u64 { + if k_i < 2 { + base_hash } else { - hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5) + base_hash.wrapping_add((k_i as u64).wrapping_mul(base_hash) % 0xffffffffffffffc5) } } @@ -218,6 +212,7 @@ pub struct BloomJournal { #[cfg(test)] mod tests { use super::Bloom; + use std::collections::HashSet; #[test] fn get_set() { @@ -248,4 +243,16 @@ mod tests { // 2/8/64 = 0.00390625 assert!(full >= 0.0039f64 && full <= 0.004f64); } + + #[test] + fn hash_backward_compatibility() { + let ss = vec!["you", "should", "not", "break", "hash", "backward", "compatibility"]; + let mut bloom = Bloom::new(16, 8); + for s in ss.iter() { + bloom.set(&s); + } + let drained_elems: HashSet = bloom.drain_journal().entries.into_iter().map(|t| t.1).collect(); + let expected: HashSet = [2094615114573771027u64, 244675582389208413u64].iter().cloned().collect(); + assert_eq!(drained_elems, expected); + } }