
Commit 1220e67

Merge pull request #332 from AaronKutch/issue-265
2 parents 557133e + 26fe6ff commit 1220e67

File tree

14 files changed (+2075, -313 lines)

Cargo.toml (+4)

@@ -40,6 +40,10 @@ panic-handler = { path = 'crates/panic-handler' }
 [features]
 default = ["compiler-builtins"]

+# Some algorithms benefit from inline assembly, but some compiler backends do
+# not support it, so inline assembly is only enabled when this flag is set.
+asm = []
+
 # Enable compilation of C code in compiler-rt, filling in some more optimized
 # implementations and also filling in unimplemented intrinsics
 c = ["cc"]

src/int/mod.rs (+1, -11)

@@ -1,16 +1,6 @@
 use core::ops;

-macro_rules! hty {
-    ($ty:ty) => {
-        <$ty as LargeInt>::HighHalf
-    };
-}
-
-macro_rules! os_ty {
-    ($ty:ty) => {
-        <$ty as Int>::OtherSign
-    };
-}
+mod specialized_div_rem;

 pub mod addsub;
 pub mod leading_zeros;
src/int/sdiv.rs (+39, -75)

@@ -1,101 +1,65 @@
-use int::Int;
-
-trait Div: Int {
-    /// Returns `a / b`
-    fn div(self, other: Self) -> Self {
-        let s_a = self >> (Self::BITS - 1);
-        let s_b = other >> (Self::BITS - 1);
-        // NOTE it's OK to overflow here because of the `.unsigned()` below.
-        // This whole operation is computing the absolute value of the inputs
-        // So some overflow will happen when dealing with e.g. `i64::MIN`
-        // where the absolute value is `(-i64::MIN) as u64`
-        let a = (self ^ s_a).wrapping_sub(s_a);
-        let b = (other ^ s_b).wrapping_sub(s_b);
-        let s = s_a ^ s_b;
-
-        let r = a.unsigned().aborting_div(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
-    }
-}
-
-impl Div for i32 {}
-impl Div for i64 {}
-impl Div for i128 {}
-
-trait Mod: Int {
-    /// Returns `a % b`
-    fn mod_(self, other: Self) -> Self {
-        let s = other >> (Self::BITS - 1);
-        // NOTE(wrapping_sub) see comment in the `div`
-        let b = (other ^ s).wrapping_sub(s);
-        let s = self >> (Self::BITS - 1);
-        let a = (self ^ s).wrapping_sub(s);
-
-        let r = a.unsigned().aborting_rem(b.unsigned());
-        (Self::from_unsigned(r) ^ s) - s
-    }
-}
-
-impl Mod for i32 {}
-impl Mod for i64 {}
-impl Mod for i128 {}
-
-trait Divmod: Int {
-    /// Returns `a / b` and sets `*rem = n % d`
-    fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self
-    where
-        F: Fn(Self, Self) -> Self,
-    {
-        let r = div(self, other);
-        // NOTE won't overflow because it's using the result from the
-        // previous division
-        *rem = self - r.wrapping_mul(other);
-        r
-    }
-}
-
-impl Divmod for i32 {}
-impl Divmod for i64 {}
+use int::specialized_div_rem::*;

 intrinsics! {
     #[maybe_use_optimized_c_shim]
     #[arm_aeabi_alias = __aeabi_idiv]
+    /// Returns `n / d`
     pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 {
-        a.div(b)
+        i32_div_rem(a, b).0
     }

     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
-        a.div(b)
+    /// Returns `n % d`
+    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
+        i32_div_rem(a, b).1
     }

-    #[win64_128bit_abi_hack]
-    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
-        a.div(b)
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
+        let quo_rem = i32_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }

     #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 {
-        a.mod_(b)
+    /// Returns `n / d`
+    pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 {
+        i64_div_rem(a, b).0
     }

     #[maybe_use_optimized_c_shim]
+    /// Returns `n % d`
     pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 {
-        a.mod_(b)
+        i64_div_rem(a, b).1
+    }
+
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
+        let quo_rem = i64_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }

     #[win64_128bit_abi_hack]
-    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
-        a.mod_(b)
+    /// Returns `n / d`
+    pub extern "C" fn __divti3(a: i128, b: i128) -> i128 {
+        i128_div_rem(a, b).0
     }

-    #[maybe_use_optimized_c_shim]
-    pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 {
-        a.divmod(b, rem, |a, b| __divsi3(a, b))
+    #[win64_128bit_abi_hack]
+    /// Returns `n % d`
+    pub extern "C" fn __modti3(a: i128, b: i128) -> i128 {
+        i128_div_rem(a, b).1
     }

-    #[aapcs_on_arm]
-    pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 {
-        a.divmod(b, rem, |a, b| __divdi3(a, b))
+    // LLVM does not currently have a `__divmodti4` function, but GCC does
+    #[maybe_use_optimized_c_shim]
+    /// Returns `n / d` and sets `*rem = n % d`
+    pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 {
+        let quo_rem = i128_div_rem(a, b);
+        *rem = quo_rem.1;
+        quo_rem.0
     }
 }
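
As an aside on the new shape of these intrinsics: each `*_div_rem` helper returns `(quotient, remainder)` as a tuple, and every intrinsic above simply projects out the component it needs. A minimal sanity check of that contract is sketched below; it is illustrative only, is not part of the commit, and assumes `b != 0` and that the `i32::MIN / -1` overflow case is not hit.

    // Illustrative only: the division is truncating, the remainder takes the sign of the
    // dividend, and `quotient * divisor + remainder` reconstructs the dividend.
    fn check_i32_div_rem(a: i32, b: i32) {
        let (q, r) = i32_div_rem(a, b);
        assert_eq!(q, a.wrapping_div(b));
        assert_eq!(r, a.wrapping_rem(b));
        assert_eq!(q.wrapping_mul(b).wrapping_add(r), a);
    }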

New file (+169)

@@ -0,0 +1,169 @@
+/// Creates unsigned and signed division functions optimized for dividing integers with the same
+/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an
+/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits
+/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to
+/// construct a full 128 bit by 128 bit division.
+#[macro_export]
+macro_rules! impl_asymmetric {
+    (
+        $unsigned_name:ident, // name of the unsigned division function
+        $signed_name:ident, // name of the signed division function
+        $zero_div_fn:ident, // function called when division by zero is attempted
+        $half_division:ident, // function for division of a $uX by a $uX
+        $asymmetric_division:ident, // function for division of a $uD by a $uX
+        $n_h:expr, // the number of bits in a $iH or $uH
+        $uH:ident, // unsigned integer with half the bit width of $uX
+        $uX:ident, // unsigned integer with half the bit width of $uD
+        $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $iD:ident, // signed integer type for the inputs and outputs of `$signed_name`
+        $($unsigned_attr:meta),*; // attributes for the unsigned function
+        $($signed_attr:meta),* // attributes for the signed function
+    ) => {
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$unsigned_attr]
+        )*
+        pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) {
+            fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(rhs as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+            fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) {
+                let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD);
+                (tmp as $uX, (tmp >> ($n_h * 2)) as $uX)
+            }
+
+            let n: u32 = $n_h * 2;
+
+            // Many of these subalgorithms are taken from trifecta.rs, see that for better
+            // documentation.
+
+            let duo_lo = duo as $uX;
+            let duo_hi = (duo >> n) as $uX;
+            let div_lo = div as $uX;
+            let div_hi = (div >> n) as $uX;
+            if div_hi == 0 {
+                if div_lo == 0 {
+                    $zero_div_fn()
+                }
+                if duo_hi < div_lo {
+                    // `$uD` by `$uX` division with a quotient that will fit into a `$uX`
+                    let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) };
+                    return (quo as $uD, rem as $uD)
+                } else if (div_lo >> $n_h) == 0 {
+                    // Short division of $uD by a $uH.
+
+                    // Some x86_64 CPUs have bad division implementations that make specializing
+                    // this case faster.
+                    let div_0 = div_lo as $uH as $uX;
+                    let (quo_hi, rem_3) = $half_division(duo_hi, div_0);
+
+                    let duo_mid =
+                        ((duo >> $n_h) as $uH as $uX)
+                        | (rem_3 << $n_h);
+                    let (quo_1, rem_2) = $half_division(duo_mid, div_0);
+
+                    let duo_lo =
+                        (duo as $uH as $uX)
+                        | (rem_2 << $n_h);
+                    let (quo_0, rem_1) = $half_division(duo_lo, div_0);
+
+                    return (
+                        (quo_0 as $uD)
+                            | ((quo_1 as $uD) << $n_h)
+                            | ((quo_hi as $uD) << n),
+                        rem_1 as $uD
+                    )
+                } else {
+                    // Short division using the $uD by $uX division
+                    let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo);
+                    let tmp = unsafe {
+                        $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo)
+                    };
+                    return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD)
+                }
+            }
+
+            let duo_lz = duo_hi.leading_zeros();
+            let div_lz = div_hi.leading_zeros();
+            let rel_leading_sb = div_lz.wrapping_sub(duo_lz);
+            if rel_leading_sb < $n_h {
+                // Some x86_64 CPUs have bad hardware division implementations that make putting
+                // a two possibility algorithm here beneficial. We also avoid a full `$uD`
+                // multiplication.
+                let shift = n - duo_lz;
+                let duo_sig_n = (duo >> shift) as $uX;
+                let div_sig_n = (div >> shift) as $uX;
+                let quo = $half_division(duo_sig_n, div_sig_n).0;
+                let div_lo = div as $uX;
+                let div_hi = (div >> n) as $uX;
+                let (tmp_lo, carry) = carrying_mul(quo, div_lo);
+                let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry);
+                let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n);
+                if (overflow != 0) || (duo < tmp) {
+                    return (
+                        (quo - 1) as $uD,
+                        duo.wrapping_add(div).wrapping_sub(tmp)
+                    )
+                } else {
+                    return (
+                        quo as $uD,
+                        duo - tmp
+                    )
+                }
+            } else {
+                // This has been adapted from
+                // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn
+                // adapted from Hacker's Delight. This is similar to the two possibility algorithm
+                // in that it uses only more significant parts of `duo` and `div` to divide a large
+                // integer with a smaller division instruction.
+
+                let div_extra = n - div_lz;
+                let div_sig_n = (div >> div_extra) as $uX;
+                let tmp = unsafe {
+                    $asymmetric_division(duo >> 1, div_sig_n)
+                };
+
+                let mut quo = tmp.0 >> ((n - 1) - div_lz);
+                if quo != 0 {
+                    quo -= 1;
+                }
+
+                // Note that this is a full `$uD` multiplication being used here
+                let mut rem = duo - (quo as $uD).wrapping_mul(div);
+                if div <= rem {
+                    quo += 1;
+                    rem -= div;
+                }
+                return (quo as $uD, rem)
+            }
+        }
+
+        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
+        /// tuple.
+        $(
+            #[$signed_attr]
+        )*
+        pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) {
+            match (duo < 0, div < 0) {
+                (false, false) => {
+                    let t = $unsigned_name(duo as $uD, div as $uD);
+                    (t.0 as $iD, t.1 as $iD)
+                },
+                (true, false) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD);
+                    ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg())
+                },
+                (false, true) => {
+                    let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD);
+                    ((t.0 as $iD).wrapping_neg(), t.1 as $iD)
+                },
+                (true, true) => {
+                    let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD);
+                    (t.0 as $iD, (t.1 as $iD).wrapping_neg())
+                },
+            }
+        }
+    }
+}
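
To make the macro's parameter list concrete, here is a hypothetical invocation; the helper names and bodies below are placeholders, not the crate's actual ones. With `$uH = u32`, `$uX = u64`, and `$uD = u128`, the macro would emit a full `u128` by `u128` division (and its signed counterpart) built from a 64-bit half division and a 128-by-64-bit asymmetric division.

    // All names are illustrative. `zero_div_fn` handles division by zero,
    // `u64_by_u64_div_rem` is the half division, and `u128_by_u64_div_rem` stands in for
    // the asymmetric division (only safe to call when the quotient fits in 64 bits).
    fn zero_div_fn() -> ! {
        panic!("attempt to divide by zero")
    }

    fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
        (duo / div, duo % div)
    }

    unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
        ((duo / div as u128) as u64, (duo % div as u128) as u64)
    }

    impl_asymmetric!(
        u128_div_rem_example, // generated unsigned division function
        i128_div_rem_example, // generated signed division function
        zero_div_fn,
        u64_by_u64_div_rem,
        u128_by_u64_div_rem,
        32,   // number of bits in a u32 ($uH)
        u32,  // $uH
        u64,  // $uX
        u128, // $uD
        i128, // $iD
        inline; // attribute placed on the unsigned function
        inline  // attribute placed on the signed function
    );

    // Usage: both generated functions return `(quotient, remainder)`.
    // let (q, r) = u128_div_rem_example(300, 7); // q == 42, r == 6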
