diff --git a/util/benches/bigint.rs b/util/benches/bigint.rs
index 524d31508..38ce10a4a 100644
--- a/util/benches/bigint.rs
+++ b/util/benches/bigint.rs
@@ -21,6 +21,7 @@
 //! ```
 
 #![feature(test)]
+#![feature(asm)]
 
 extern crate test;
 extern crate ethcore_util;
@@ -40,7 +41,24 @@ fn u256_add(b: &mut Bencher) {
 fn u256_sub(b: &mut Bencher) {
 	b.iter(|| {
 		let n = black_box(10000);
-		(0..n).fold(U256::zero(), |old, new| { old.overflowing_add(U256::from(new)).0 })
+		(0..n).fold(U256::zero(), |acc, step| acc.overflowing_sub(U256::from(step)).0)
+	});
+}
+
+#[bench]
+fn u256_mul(b: &mut Bencher) {
+	b.iter(|| {
+		// `| 1` keeps every factor odd, so the wrapping product never collapses to zero.
+		(0..black_box(10000)).fold(U256([12345u64, 0u64, 0u64, 0u64]), |acc, i| acc.overflowing_mul(U256::from(i | 1)).0)
+	});
+}
+
+
+#[bench]
+fn u128_mul(b: &mut Bencher) {
+	b.iter(|| {
+		// `| 1` keeps every factor odd, so the wrapping product never collapses to zero.
+		(0..black_box(10000)).fold(U128([12345u64, 0u64]), |acc, i| acc.overflowing_mul(U128::from(i | 1)).0)
 	});
 }
 
diff --git a/util/src/uint.rs b/util/src/uint.rs
index 147b83e42..38b4e4906 100644
--- a/util/src/uint.rs
+++ b/util/src/uint.rs
@@ -41,8 +41,6 @@ use from_json::*;
 use rustc_serialize::hex::ToHex;
 use serde;
 
-#[cfg_attr(x64_asm_optimizations, all(feature = "dev", target_arch = "x86_64"))]
-
 macro_rules! impl_map_from {
 	($thing:ident, $from:ty, $to:ty) => {
 		impl From<$from> for $thing {
@@ -53,7 +51,8 @@ macro_rules! impl_map_from {
 	}
 }
 
-macro_rules! overflowing_add_regular {
+#[cfg(not(all(feature="dev", target_arch = "x86_64")))]
+macro_rules! uint_overflowing_add {
 	($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ({
 		let $name(ref me) = $self_expr;
 		let $name(ref you) = $other;
@@ -83,7 +82,8 @@ macro_rules! overflowing_add_regular {
 	})
 }
 
-macro_rules! add_64x_optimized {
+#[cfg(all(feature="dev", target_arch = "x86_64"))]
+macro_rules! uint_overflowing_add {
 	(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({
 		let mut result: [u64; 4] = unsafe { mem::uninitialized() };
 		let self_t: &[u64; 4] = unsafe { &mem::transmute($self_expr) };
@@ -110,7 +110,17 @@ macro_rules! add_64x_optimized {
 	)
 }
 
-macro_rules! sub_64x_optimized {
+#[cfg(not(all(feature="dev", target_arch = "x86_64")))]
+macro_rules! uint_overflowing_sub {
+	($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({
+		// a - b is computed as a + (!b + 1); the flag is set exactly when a < b.
+		let negated = overflowing!((!$other).overflowing_add(From::from(1u64)));
+		(overflowing!($self_expr.overflowing_add(negated)), $self_expr < $other)
+	})
+}
+
+#[cfg(all(feature="dev", target_arch = "x86_64"))]
+macro_rules! uint_overflowing_sub {
 	(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({
 		let mut result: [u64; 4] = unsafe { mem::uninitialized() };
 		let self_t: &[u64; 4] = unsafe { &mem::transmute($self_expr) };
@@ -137,6 +147,119 @@ macro_rules! sub_64x_optimized {
 	)
 }
 
+// x86_64 fast path: U256 is multiplied in assembly; other widths use the portable loop.
+#[cfg(all(feature="dev", target_arch = "x86_64"))]
+macro_rules! uint_overflowing_mul {
+	(U256, $n_words: expr, $self_expr: expr, $other: expr) => ({
+		let U256(ref me) = $self_expr;
+		let U256(ref you) = $other;
+		let mut result = [0u64; 4];
+		let carry: u64;
+		// Schoolbook multiplication over 64-bit limbs: r8..r11 accumulate the low 256 bits, r12
+		// hands the high half of one partial product to the next limb, and rcx collects (by OR)
+		// whatever spills past limb 3. Each step forms `product + carry_in + limb`, which is at
+		// most 2^128 - 1, so rdx never wraps. Outputs are early-clobber (`&`) because they are
+		// written before the last input is read. SAFETY: the asm only reads its eight operands.
+		unsafe {
+			asm!("
+				mov $5, %rax
+				mulq $9
+				mov %rax, %r8
+				mov %rdx, %r9
+				mov $6, %rax
+				mulq $9
+				add %rax, %r9
+				adc $$0, %rdx
+				mov %rdx, %r10
+				mov $7, %rax
+				mulq $9
+				add %rax, %r10
+				adc $$0, %rdx
+				mov %rdx, %r11
+				mov $8, %rax
+				mulq $9
+				add %rax, %r11
+				adc $$0, %rdx
+				mov %rdx, %rcx
+
+				mov $5, %rax
+				mulq $10
+				add %rax, %r9
+				adc $$0, %rdx
+				mov %rdx, %r12
+				mov $6, %rax
+				mulq $10
+				add %r12, %rax
+				adc $$0, %rdx
+				add %rax, %r10
+				adc $$0, %rdx
+				mov %rdx, %r12
+				mov $7, %rax
+				mulq $10
+				add %r12, %rax
+				adc $$0, %rdx
+				add %rax, %r11
+				adc $$0, %rdx
+				or %rdx, %rcx
+
+				mov $5, %rax
+				mulq $11
+				add %rax, %r10
+				adc $$0, %rdx
+				mov %rdx, %r12
+				mov $6, %rax
+				mulq $11
+				add %r12, %rax
+				adc $$0, %rdx
+				add %rax, %r11
+				adc $$0, %rdx
+				or %rdx, %rcx
+
+				mov $5, %rax
+				mulq $12
+				add %rax, %r11
+				adc $$0, %rdx
+				or %rdx, %rcx
+				"
+				: /* $0 */ "=&{r8}"(result[0]), /* $1 */ "=&{r9}"(result[1]), /* $2 */ "=&{r10}"(result[2]),
+				  /* $3 */ "=&{r11}"(result[3]), /* $4 */ "=&{rcx}"(carry)
+				: /* $5 */ "m"(me[0]), /* $6 */ "m"(me[1]), /* $7 */ "m"(me[2]), /* $8 */ "m"(me[3]),
+				  /* $9 */ "m"(you[0]), /* $10 */ "m"(you[1]), /* $11 */ "m"(you[2]), /* $12 */ "m"(you[3])
+				: "rax", "rdx", "r12", "cc"
+			);
+		}
+		// Partial products me[i] * you[j] with i + j >= 4 lie wholly above bit 255: non-zero means overflow.
+		let spill = (me[1] != 0 && you[3] != 0) || (me[2] != 0 && (you[2] | you[3]) != 0)
+			|| (me[3] != 0 && (you[1] | you[2] | you[3]) != 0);
+		(U256(result), carry != 0 || spill)
+	});
+	($name:ident, $n_words:expr, $self_expr: expr, $other: expr) => ({
+		// Portable shift-and-add, inlined here: no `overflowing_mul_regular!` exists under this cfg.
+		let (mut res, mut overflow) = ($name::from(0u64), false);
+		for i in 0..(2 * $n_words) {
+			let v = overflowing!($self_expr.overflowing_mul_u32(($other >> (32 * i)).low_u32()), overflow);
+			let res2 = overflowing!(v.overflowing_shl(32 * i as u32), overflow);
+			res = overflowing!(res.overflowing_add(res2), overflow);
+		}
+		(res, overflow)
+	})
+}
+
+#[cfg(not(all(feature="dev", target_arch = "x86_64")))]
+macro_rules! uint_overflowing_mul {
+	($name:ident, $n_words: expr, $self_expr: expr, $other: expr) => ({
+		// Shift-and-add over the 32-bit digits of `$other`, lowest digit first.
+		// TODO: be more efficient about this
+		let (mut product, mut overflow) = ($name::from(0u64), false);
+		for digit in 0..(2 * $n_words) {
+			let partial = overflowing!($self_expr.overflowing_mul_u32(($other >> (32 * digit)).low_u32()), overflow);
+			let shifted = overflowing!(partial.overflowing_shl(32 * digit as u32), overflow);
+			product = overflowing!(product.overflowing_add(shifted), overflow);
+		}
+		(product, overflow)
+	})
+}
+
 macro_rules! overflowing {
 	($op: expr, $overflow: expr) => (
 		{
@@ -384,38 +507,21 @@ macro_rules! construct_uint {
 			}
 
-			/// Optimized instructions
-			#[cfg(x64_asm_optimizations)]
+			/// Forwards to `uint_overflowing_add!`, whose implementation is chosen by target cfg.
 			#[inline]
 			fn overflowing_add(self, other: $name) -> ($name, bool) {
-				add_64x_optimized!($name, $n_words, self, other)
-			}
-			#[cfg(not(x64_asm_optimizations))]
-			fn overflowing_add(self, other: $name) -> ($name, bool) {
-				overflowing_add_regular!($name, $n_words, self, other)
+				uint_overflowing_add!($name, $n_words, self, other)
 			}
 
-			#[cfg(x64_asm_optimizations)]
+			/// Forwards to `uint_overflowing_sub!`, whose implementation is chosen by target cfg.
 			#[inline]
 			fn overflowing_sub(self, other: $name) -> ($name, bool) {
-				sub_64x_optimized!($name, $n_words, self, other)
-			}
-			#[cfg(not(x64_asm_optimizations))]
-			fn overflowing_sub(self, other: $name) -> ($name, bool) {
-				let res = overflowing!((!other).overflowing_add(From::from(1u64)));
-				let res = overflowing!(self.overflowing_add(res));
-				(res, self < other)
+				uint_overflowing_sub!($name, $n_words, self, other)
 			}
 
+			/// Forwards to `uint_overflowing_mul!`, whose implementation is chosen by target cfg.
+			#[inline]
 			fn overflowing_mul(self, other: $name) -> ($name, bool) {
-				let mut res = $name::from(0u64);
-				let mut overflow = false;
-				// TODO: be more efficient about this
-				for i in 0..(2 * $n_words) {
-					let v = overflowing!(self.overflowing_mul_u32((other >> (32 * i)).low_u32()), overflow);
-					let res2 = overflowing!(v.overflowing_shl(32 * i as u32), overflow);
-					res = overflowing!(res.overflowing_add(res2), overflow);
-				}
-				(res, overflow)
+				uint_overflowing_mul!($name, $n_words, self, other)
 			}
 
 			fn overflowing_div(self, other: $name) -> ($name, bool) {