Try #236:

bors[bot] · web-flow · commit e18adb368041 · 2022-02-24T19:18:22.000Z
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -18,6 +18,7 @@ jobs:
           1.36.0, # alloc, rand
           1.40.0, # arbitrary
           1.46.0, # quickcheck
+          1.59.0, # asm!
           stable,
           beta,
           nightly
diff --git a/bors.toml b/bors.toml
@@ -4,6 +4,7 @@ status = [
   "Test (1.36.0)",
   "Test (1.40.0)",
   "Test (1.46.0)",
+  "Test (1.59.0)",
   "Test (stable)",
   "Test (beta)",
   "Test (nightly)",
diff --git a/build.rs b/build.rs
@@ -10,6 +10,7 @@ fn main() {
     if u64_digit {
         autocfg::emit("u64_digit");
     }
+
     let ac = autocfg::new();
     let std = if ac.probe_sysroot_crate("std") {
         "std"
@@ -28,6 +29,10 @@ fn main() {
             if ac.probe_path(&addcarry) {
                 autocfg::emit("use_addcarry");
             }
+
+            if ac.probe_path(&format!("{}::arch::asm", std)) {
+                autocfg::emit("use_x86_div");
+            }
         }
     }
 
diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs
@@ -657,16 +657,23 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
 
     let mut digits = u.clone();
 
-    let (base, power) = get_radix_base(radix, big_digit::HALF_BITS);
+    // The x86 `div` instruction can quickly divide by a full digit; otherwise we choose a
+    // divisor that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
+    let bits = if cfg!(use_x86_div) {
+        big_digit::BITS
+    } else {
+        big_digit::HALF_BITS
+    };
+    let (base, power) = get_radix_base(radix, bits);
     let radix = radix as BigDigit;
 
     // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
     // performance. We can mitigate this by dividing into chunks of a larger base first.
     // The threshold for this was chosen by anecdotal performance measurements to
     // approximate where this starts to make a noticeable difference.
     if digits.data.len() >= 64 {
-        let mut big_base = BigUint::from(base * base);
-        let mut big_power = 2usize;
+        let mut big_base = BigUint::from(base);
+        let mut big_power = 1usize;
 
         // Choose a target base length near √n.
         let target_len = digits.data.len().sqrt();
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
@@ -18,6 +18,7 @@ use num_traits::{CheckedDiv, One, ToPrimitive, Zero};
 /// This is _not_ true for an arbitrary numerator/denominator.
 ///
 /// (This function also matches what the x86 divide instruction does).
+#[cfg(not(use_x86_div))]
 #[inline]
 fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
     debug_assert!(hi < divisor);
@@ -27,6 +28,44 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
     ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
 }
 
+/// With Rust 1.59+ for stable `asm!`, x86 and x86_64 can use a real `div` instruction to divide `hi:lo` by `divisor`.
+#[cfg(use_x86_div)]
+#[inline]
+fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
+    // This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one
+    // register; without it (in release mode) either case becomes a target-specific fault like
+    // SIGFPE. This should never occur with the inputs from our few `div_wide` callers.
+    debug_assert!(hi < divisor);
+
+    // SAFETY: The `div` instruction only affects registers, reading the explicit operand as the
+    // divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly
+    // written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is
+    // used (`nomem`, `nostack`), and flags are not preserved (no `preserves_flags` option).
+    unsafe {
+        let (div, rem); // assigned by whichever `asm!` invocation below is compiled in
+
+        #[cfg(u64_digit)] // this variant divides RDX:RAX
+        core::arch::asm!(
+            "div {:r}", // `:r` formats the operand as a 64-bit register name
+            in(reg) divisor, // explicit operand: the divisor
+            inout("rdx") hi => rem, // high half of the dividend in, remainder out
+            inout("rax") lo => div, // low half of the dividend in, quotient out
+            options(pure, nomem, nostack),
+        );
+
+        #[cfg(not(u64_digit))] // this variant divides EDX:EAX
+        core::arch::asm!(
+            "div {:e}", // `:e` formats the operand as a 32-bit register name
+            in(reg) divisor, // explicit operand: the divisor
+            inout("edx") hi => rem, // high half of the dividend in, remainder out
+            inout("eax") lo => div, // low half of the dividend in, quotient out
+            options(pure, nomem, nostack),
+        );
+
+        (div, rem) // (quotient, remainder)
+    }
+}
+
 /// For small divisors, we can divide without promoting to `DoubleBigDigit` by
 /// using half-size pieces of digit, like long-division.
 #[inline]
@@ -47,7 +86,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
 
     let mut rem = 0;
 
-    if b <= big_digit::HALF {
+    if !cfg!(use_x86_div) && b <= big_digit::HALF {
         for d in a.data.iter_mut().rev() {
             let (q, r) = div_half(rem, *d, b);
             *d = q;
@@ -72,7 +111,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 
     let mut rem = 0;
 
-    if b <= big_digit::HALF {
+    if !cfg!(use_x86_div) && b <= big_digit::HALF {
         for &digit in a.data.iter().rev() {
             let (_, r) = div_half(rem, digit, b);
             rem = r;
@@ -232,7 +271,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
     let mut a0 = 0;
 
     // [b1, b0] are the two most significant digits of the divisor. They never change.
-    let b0 = *b.last().unwrap();
+    let b0 = b[b.len() - 1];
     let b1 = b[b.len() - 2];
 
     let q_len = a.data.len() - b.len() + 1;

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ fn main() {`
`10`	`10`	`if u64_digit {`
`11`	`11`	`autocfg::emit("u64_digit");`
`12`	`12`	`}`
	`13`	`+`
`13`	`14`	`let ac = autocfg::new();`
`14`	`15`	`let std = if ac.probe_sysroot_crate("std") {`
`15`	`16`	`"std"`
`@@ -28,6 +29,10 @@ fn main() {`
`28`	`29`	`if ac.probe_path(&addcarry) {`
`29`	`30`	`autocfg::emit("use_addcarry");`
`30`	`31`	`}`
	`32`	`+`
	`33`	`+ if ac.probe_path(&format!("{}::arch::asm", std)) {`
	`34`	`+ autocfg::emit("use_x86_div");`
	`35`	`+ }`
`31`	`36`	`}`
`32`	`37`	`}`
`33`	`38`