Couple of more aggresive optimizations

This commit is contained in:
Tomasz Drwięga 2016-03-08 10:05:46 +01:00
parent 17b2d2a2d7
commit 9ae2341ba9

View File

@ -395,28 +395,31 @@ macro_rules! uint_overflowing_mul_reg {
let mut ret = [0u64; 2*$n_words]; let mut ret = [0u64; 2*$n_words];
for i in 0..$n_words { for i in 0..$n_words {
if you[i] == 0 {
continue;
}
let mut carry2 = 0u64; let mut carry2 = 0u64;
let (b_u, b_l) = (you[i] >> 32, you[i] & 0xFFFFFFFF); let (b_u, b_l) = split(you[i]);
for j in 0..$n_words { for j in 0..$n_words {
let a = me[j]; if me[j] == 0 {
continue;
}
let a = split(me[j]);
// multiply parts // multiply parts
let (c_l, overflow_l) = mul_u32(a, b_l as u32, ret[j + i]); let (c_l, overflow_l) = mul_u32(a, b_l, ret[i + j]);
let (c_u, overflow_u) = mul_u32(a, b_u as u32, c_l >> 32); let (c_u, overflow_u) = mul_u32(a, b_u, c_l >> 32);
ret[i + j] = (c_l & 0xFFFFFFFF) + (c_u << 32);
// This won't overflow // Only single overflow possible here
ret[j + i] = (c_l & 0xFFFFFFFF) + (c_u << 32); let carry = (c_u >> 32) + (overflow_u << 32) + overflow_l + carry2;
let (carry, o) = carry.overflowing_add(ret[i + j + 1]);
// carry1 = overflow_l + (c_u >> 32) + (overflow_u << 32) + carry2 + c0; ret[i + j + 1] = carry;
let (ca1, c1) = overflow_l.overflowing_add((c_u >> 32) + (overflow_u << 32)); carry2 = o as u64;
let (ca1, c2) = ca1.overflowing_add(ret[j + i + 1]);
let (ca1, c3) = ca1.overflowing_add(carry2);
ret[j + i + 1] = ca1;
// Will never overflow
carry2 = (overflow_u >> 32) + c1 as u64 + c2 as u64 + c3 as u64;
} }
} }
@ -459,10 +462,9 @@ macro_rules! panic_on_overflow {
} }
#[inline(always)] #[inline(always)]
fn mul_u32(a: u64, b: u32, carry: u64) -> (u64, u64) { fn mul_u32(a: (u64, u64), b: u64, carry: u64) -> (u64, u64) {
let b = b as u64; let upper = b * a.0;
let upper = b * (a >> 32); let lower = b * a.1;
let lower = b * (a & 0xFFFFFFFF);
let (res1, overflow1) = lower.overflowing_add(upper << 32); let (res1, overflow1) = lower.overflowing_add(upper << 32);
let (res2, overflow2) = res1.overflowing_add(carry); let (res2, overflow2) = res1.overflowing_add(carry);
@ -471,6 +473,11 @@ fn mul_u32(a: u64, b: u32, carry: u64) -> (u64, u64) {
(res2, carry) (res2, carry)
} }
#[inline(always)]
fn split(a: u64) -> (u64, u64) {
(a >> 32, a & 0xFFFFFFFF)
}
/// Large, fixed-length unsigned integer type. /// Large, fixed-length unsigned integer type.
pub trait Uint: Sized + Default + FromStr + From<u64> + fmt::Debug + fmt::Display + PartialOrd + Ord + PartialEq + Eq + Hash { pub trait Uint: Sized + Default + FromStr + From<u64> + fmt::Debug + fmt::Display + PartialOrd + Ord + PartialEq + Eq + Hash {
@ -734,9 +741,10 @@ macro_rules! construct_uint {
let $name(ref arr) = self; let $name(ref arr) = self;
let mut ret = [0u64; $n_words]; let mut ret = [0u64; $n_words];
let mut carry = 0; let mut carry = 0;
let o = other as u64;
for i in 0..$n_words { for i in 0..$n_words {
let (res, carry2) = mul_u32(arr[i], other, carry); let (res, carry2) = mul_u32(split(arr[i]), o, carry);
ret[i] = res; ret[i] = res;
carry = carry2; carry = carry2;
} }
@ -1255,28 +1263,31 @@ impl U256 {
let mut ret = [0u64; 8]; let mut ret = [0u64; 8];
for i in 0..4 { for i in 0..4 {
if you[i] == 0 {
continue;
}
let mut carry2 = 0u64; let mut carry2 = 0u64;
let (b_u, b_l) = (you[i] >> 32, you[i] & 0xFFFFFFFF); let (b_u, b_l) = split(you[i]);
for j in 0..4 { for j in 0..4 {
let a = me[j]; if me[j] == 0 {
continue;
}
let a = split(me[j]);
// multiply parts // multiply parts
let (c_l, overflow_l) = mul_u32(a, b_l as u32, ret[j + i]); let (c_l, overflow_l) = mul_u32(a, b_l, ret[i + j]);
let (c_u, overflow_u) = mul_u32(a, b_u as u32, c_l >> 32); let (c_u, overflow_u) = mul_u32(a, b_u, c_l >> 32);
ret[i + j] = (c_l & 0xFFFFFFFF) + (c_u << 32);
// This won't overflow // Only single overflow possible here
ret[j + i] = (c_l & 0xFFFFFFFF) + (c_u << 32); let carry = (c_u >> 32) + (overflow_u << 32) + overflow_l + carry2;
let (carry, o) = carry.overflowing_add(ret[i + j + 1]);
// carry1 = overflow_l + (c_u >> 32) + (overflow_u << 32) + carry2 + c0; ret[i + j + 1] = carry;
let (ca1, c1) = overflow_l.overflowing_add((c_u >> 32) + (overflow_u << 32)); carry2 = o as u64;
let (ca1, c2) = ca1.overflowing_add(ret[j + i + 1]);
let (ca1, c3) = ca1.overflowing_add(carry2);
ret[j + i + 1] = ca1;
// Will never overflow
carry2 = (overflow_u >> 32) + c1 as u64 + c2 as u64 + c3 as u64;
} }
} }