Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hardcoded flint_mpn_aors_n for ARM and x86 #2118

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions dev/gen_arm_aors.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#
# Copyright (C) 2024 Albin Ahlbäck
#
# This file is part of FLINT.
#
# FLINT is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License (LGPL) as published
# by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version. See <https://www.gnu.org/licenses/>.
#

# Generating routines for r <- a OP b, where OP is either + or -.
#
# This generation was constructed with Apple silicon processors in mind.
# Processors decoding fewer than 6 operations per cycle, or with few load and
# store units, may have worse performance.

# Symbolic names of the result pointer and the two source pointers.
r, a, b = "rp", "ap", "bp"

# Memory operand, written as "[<pointer>,#<ix>*8]", for index `ix` relative to
# the result pointer.
function rp(ix::Int)
    return string("[", r, ",#", ix, "*8]")
end

# Same as `rp`, relative to the first source pointer.
function ap(ix::Int)
    return string("[", a, ",#", ix, "*8]")
end

# Same as `rp`, relative to the second source pointer.
function bp(ix::Int)
    return string("[", b, ",#", ix, "*8]")
end

sx = "sx" # Return value for carry or borrow
CC = "CC" # Symbolic condition operand (handed to `cset` by the generator)

sp = string.("s", 0:14) # Scratch registers s0, ..., s14

# Writes assembly that should be preprocessed by M4.
#
# Returns a `String` holding one routine wrapped in
# `PROLOGUE(flint_mpn_aors(n))` / `EPILOGUE()` that combines `n` limbs read
# through `ap` and `bp` and stores them through `rp`.  Limbs are processed two
# at a time with `ldp`/`stp`; an odd trailing limb is processed with
# `ldr`/`str`.  The mnemonics `OP` (first limb) and `OPC` (all later limbs) are
# emitted verbatim -- presumably they are resolved to the add or subtract
# instructions by the M4 layer.  The routine ends with `cset sx, CC`.
#
# Throws an `ArgumentError` if `n < 2`: the first emitted `ldp`/`stp` group
# always touches two limbs, so a shorter operand would yield code that accesses
# memory beyond the operands.
function aors(n::Int)
    n >= 2 || throw(ArgumentError("n must be at least 2, got $n"))

    io = IOBuffer()

    # Appends one instruction line: tab, mnemonic, tab, comma-separated operands.
    emit(mnemonic::String, operands::String...) =
        print(io, '\t', mnemonic, '\t', join(operands, ", "), '\n')

    # Scratch register number `k` as seen by limb pair `pair`.  Each pair
    # starts four registers further into `sp` (wrapping around at the end), so
    # neighbouring pairs never share registers.
    nregs = length(sp)
    s(pair::Int, k::Int) = sp[mod(4 * pair + k, nregs) + 1]

    print(io, "PROLOGUE(flint_mpn_aors($n))\n")

    npairs = n ÷ 2
    for pair in 0:(npairs - 1)
        alo, ahi = s(pair, 0), s(pair, 2)
        blo, bhi = s(pair, 1), s(pair, 3)
        emit("ldp", alo, ahi, ap(2 * pair))
        emit("ldp", blo, bhi, bp(2 * pair))
        # Only the very first limb uses `OP`; every later limb uses `OPC`.
        emit(pair == 0 ? "OP" : "OPC", alo, alo, blo)
        emit("OPC", ahi, ahi, bhi)
        emit("stp", alo, ahi, rp(2 * pair))
    end

    if isodd(n)
        # Trailing single limb: uses registers 4 and 5 relative to the last
        # pair, i.e. registers that the last pair did not touch.
        ta, tb = s(npairs - 1, 4), s(npairs - 1, 5)
        emit("ldr", ta, ap(n - 1))
        emit("ldr", tb, bp(n - 1))
        emit("OPC", ta, ta, tb)
        emit("str", ta, rp(n - 1))
    end

    emit("cset", sx, CC)

    print(io, "\tret\nEPILOGUE()\n")

    return String(take!(io))
end

# Prints the routine produced by `aors(n)` to standard output for every
# n = 2, ..., nmax, in increasing order.  Prints nothing when `nmax < 2`.
function print_all_aors(nmax::Int = 16)
    foreach(2:nmax) do n
        println(aors(n))
    end
end
83 changes: 83 additions & 0 deletions dev/gen_x86_aors.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#
# Copyright (C) 2024 Albin Ahlbäck
#
# This file is part of FLINT.
#
# FLINT is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License (LGPL) as published
# by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version. See <https://www.gnu.org/licenses/>.
#

# Generating routines for r <- a OP b, where OP is either + or -.
#
# This generation was constructed with processors with decent schedulers in
# mind.

# Symbolic names of the result pointer and the two source pointers.
r, a, b = "rp", "ap", "bp"

# Memory operand, written as "<ix>*8(<pointer>)", for index `ix` relative to
# the result pointer.
function rp(ix::Int)
    return string(ix, "*8(", r, ")")
end

# Same as `rp`, relative to the first source pointer.
function ap(ix::Int)
    return string(ix, "*8(", a, ")")
end

# Same as `rp`, relative to the second source pointer.
function bp(ix::Int)
    return string(ix, "*8(", b, ")")
end

sx = "sx" # Return value for carry or borrow, i.e. %rax

# Wraps a register name in the `R32(...)` macro call.
function R32(sx::String)
    return string("R32(", sx, ")")
end

# Wraps a register name in the `R8(...)` macro call.
function R8(sx::String)
    return string("R8(", sx, ")")
end

sp = string.("s", 0:4) # Scratch registers s0, ..., s4

# Writes assembly that should be preprocessed by M4.
#
# Returns a `String` holding one routine, preceded by `ALIGN(16)` and wrapped
# in `PROLOGUE(flint_mpn_aors(n))` / `EPILOGUE()`, that combines `n` limbs read
# through `ap` and `bp` and stores them through `rp`.  The load of limb
# `ix + 1` is emitted ahead of the operation on limb `ix`.  The mnemonics `OP`
# (limb 0) and `OPC` (all later limbs) are emitted verbatim -- presumably they
# are resolved to the add or subtract instructions by the M4 layer.  The
# routine ends by storing the carry flag into `R8(sx)` with `setc`.
#
# Throws an `ArgumentError` if `n < 2`: the load of `1*8(ap)` is emitted
# unconditionally, so a one-limb operand would yield code that reads beyond it.
function aors(n::Int)
    n >= 2 || throw(ArgumentError("n must be at least 2, got $n"))

    io = IOBuffer()

    # Appends one instruction line: tab, mnemonic, tab, comma-separated operands.
    emit(mnemonic::String, operands::String...) =
        print(io, '\t', mnemonic, '\t', join(operands, ", "), '\n')

    # Scratch register holding limb `ix`: the registers in `sp` are handed out
    # round-robin, one per limb.
    nregs = length(sp)
    limb(ix::Int) = sp[mod(ix, nregs) + 1]

    print(io, "\tALIGN(16)\nPROLOGUE(flint_mpn_aors($n))\n")

    emit("mov", ap(0), limb(0))
    emit("mov", ap(1), limb(1))
    # Clear the return register up front, before the first `OP` is emitted.
    emit("xor", R32(sx), R32(sx))
    emit("OP", bp(0), limb(0))
    emit("mov", limb(0), rp(0))

    for ix in 1:(n - 2)
        emit("mov", ap(ix + 1), limb(ix + 1))
        emit("OPC", bp(ix), limb(ix))
        emit("mov", limb(ix), rp(ix))
    end

    # The last limb was already loaded by the previous step.
    emit("OPC", bp(n - 1), limb(n - 1))
    emit("mov", limb(n - 1), rp(n - 1))
    emit("setc", R8(sx))

    print(io, "\tret\nEPILOGUE()\n")

    return String(take!(io))
end

# Prints the routine produced by `aors(n)` to standard output for every
# n = 2, ..., nmax, in increasing order.  Prints nothing when `nmax < 2`.
function print_all_aors(nmax::Int = 16)
    foreach(2:nmax) do n
        println(aors(n))
    end
end
159 changes: 159 additions & 0 deletions dev/gen_x86_aorsrsh.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#
# Copyright (C) 2024 Albin Ahlbäck
#
# This file is part of FLINT.
#
# FLINT is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License (LGPL) as published
# by the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version. See <https://www.gnu.org/licenses/>.
#

# Generating routines for r <- a OP (b >> cnt), i.e. a OP floor(b / 2^cnt),
# where OP is either + or -.

# Symbolic names of the result pointer, the two source pointers and the shift
# count.
r = "rp"
a = "ap"
b = "bp"
cnt = "cnt"

# Memory operands, written as "<ix>*8(<pointer>)", for index `ix` relative to
# the result pointer and the two source pointers respectively.
rp(ix::Int) = "$ix*8($r)"
ap(ix::Int) = "$ix*8($a)"
bp(ix::Int) = "$ix*8($b)"

tnc = "tnc" # The generator emits code that sets this to the negated `cnt`
sx = "sx" # Return value for carry or borrow, i.e. %rax

# Wrap a register name in the `R32(...)` / `R8(...)` macro call.
R32(sx::String) = "R32($sx)"
R8(sx::String) = "R8($sx)"

s0 = "s0"
s1 = "s1"
s2 = "s2"
s3 = "s3"
sp = ["s$ix" for ix in 0:3] # Scratch registers

# Zero-based lookup into the scratch registers: s(0) is "s0", s(3) is "s3".
# This must index `sp`; indexing `s` itself would index the function object
# and fail on every call.
s(ix::Int) = sp[ix + 1]

# Writes assembly that should be preprocessed by M4.
#
# Returns a `String` holding one routine, preceded by `ALIGN(16)` and wrapped
# in `PROLOGUE(flint_mpn_addrsh_<n>)` (`is_add = true`) or
# `PROLOGUE(flint_mpn_subrsh_<n>)` (`is_add = false`) and `EPILOGUE()`.
#
# For every limb index ix < n - 1 the emitted code forms
#     (bp[ix] shifted by `cnt` with shrx) + (bp[ix + 1] shifted by `tnc` with shlx)
# and combines it with ap[ix]; for the last limb only the `shrx` part is used.
# Limb 0 is combined with `add`/`sub`, all later limbs with `adc`/`sbb`, and
# each result is stored through `rp`.  The routine ends by storing the carry
# flag into `R8(sx)` with `setc`.
#
# In the subtracting variant `s3` is used as an extra temporary and is
# therefore pushed on entry and popped before returning.
#
# Throws an `ArgumentError` if `n < 2`: limb 0 always loads `1*8(bp)`, so a
# one-limb operand would yield code that reads beyond it.
function aorsrsh(n::Int; is_add::Bool = true)
    n >= 2 || throw(ArgumentError("n must be at least 2, got $n"))

    io = IOBuffer()

    # Appends one instruction line: tab, mnemonic, tab, comma-separated operands.
    emit(mnemonic::String, operands::String...) =
        print(io, '\t', mnemonic, '\t', join(operands, ", "), '\n')

    # Mnemonics for limb 0 and for every later limb.
    first_op, carry_op = is_add ? ("add", "adc") : ("sub", "sbb")

    print(io, "\tALIGN(16)\nPROLOGUE(flint_mpn_", is_add ? "add" : "sub")
    print(io, "rsh_", n, ")\n")

    # Initialize variables
    if !is_add
        emit("push", s3)
    end
    emit("xor", tnc, tnc) # We do not use 32 bit mode here since tnc = %r8.
    emit("sub", cnt, tnc) # This is modulo 64, so -n = 64 - n.
    emit("xor", R32(sx), R32(sx))

    # Shift limb ix by `cnt` and, unless it is the last limb, load limb ix + 1
    # into s1.  For ix > 0 this assumes s1 contains ix*8(bp).
    function shift_and_load(ix::Int)
        if ix == 0
            emit("shrx", cnt, bp(0), s0)
            emit("mov", bp(ix + 1), s1)
        elseif ix == n - 1
            emit("shrx", cnt, s1, s1)
        else
            emit("shrx", cnt, s1, s0)
            emit("mov", bp(ix + 1), s1)
        end
        return nothing
    end # s0, s1 used

    # Add the bits shifted in from limb ix + 1 (held in s1) to s0, giving s2.
    # Nothing is emitted for the last limb, which has no successor.
    function merge_next(ix::Int)
        if ix != n - 1
            emit("shlx", tnc, s1, s2)
            emit("lea", "($s0, $s2)", s2)
        end
        return nothing
    end # s1, s2 used

    # Combine the shifted limb ix with ap[ix] and store the result in rp[ix].
    function combine(ix::Int)
        op = ix == 0 ? first_op : carry_op
        # The last limb is left in s1 by `shift_and_load`; all others in s2.
        shifted = ix == n - 1 ? s1 : s2
        if is_add
            emit(op, ap(ix), shifted)
            emit("mov", shifted, rp(ix))
        else
            # Due to the lack of an `rsub' instruction, we need an extra
            # register.  For the last limb s0 is no longer needed and is reused.
            tmp = ix == n - 1 ? s0 : s3
            emit("mov", ap(ix), tmp)
            emit(op, shifted, tmp)
            emit("mov", tmp, rp(ix))
        end
        return nothing
    end

    # We interleave as follows: the shift of limb ix is emitted before the
    # combine step of limb ix - 1.
    shift_and_load(0)
    merge_next(0)
    for ix in 1:(n - 1)
        shift_and_load(ix)
        combine(ix - 1)
        merge_next(ix)
    end
    combine(n - 1)

    if !is_add
        emit("pop", s3)
    end
    emit("setc", R8(sx))

    print(io, "\tret\nEPILOGUE()\n")

    return String(take!(io))
end

# Prints to standard output the routines produced by `aorsrsh` for every
# n = 2, ..., nmax: first all adding variants, then all subtracting variants.
# Prints nothing when `nmax < 2`.
function print_all_aorsrsh(nmax::Int = 16)
    for is_add in (true, false), n in 2:nmax
        println(aorsrsh(n, is_add = is_add))
    end
end
Loading
Loading