diff --git a/Makefile.in b/Makefile.in index 0f25aa2b56..7ac241a637 100644 --- a/Makefile.in +++ b/Makefile.in @@ -193,7 +193,8 @@ HEADER_DIRS := \ fmpz_mod_mpoly_factor fmpq_mpoly_factor \ fq_nmod_mpoly_factor fq_zech_mpoly_factor \ \ - fft @FFT_SMALL@ fmpz_poly_q fmpz_lll \ + fft n_fft @FFT_SMALL@ \ + fmpz_poly_q fmpz_lll \ n_poly arith qsieve aprcl \ \ nf nf_elem qfb \ diff --git a/src/n_fft.h b/src/n_fft.h new file mode 100644 index 0000000000..0df2674ff1 --- /dev/null +++ b/src/n_fft.h @@ -0,0 +1,234 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#ifndef N_FFT_H +#define N_FFT_H + +#include "flint.h" + +#define N_FFT_CTX_DEFAULT_DEPTH 12 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * TODO[long term] large depth can lead to heavy memory usage + * --> provide precomputation-free functions + * + * TODO[long term] avx512 vectorization + * + * TODO[long term] on zen4 (likely on other cpus as well) ctx_init becomes + * slower at some point, losing a factor 4 or more; this is expected due to + * memory aspects but arises rather early, in fact the depth where it becomes + * slower is significantly smaller (~13-14) when tab_iw has been incorporated + * compared to without tab_iw (it was depth ~20-21); see if this can be + * understood, and maybe play with vectorization for those simple functions + */ + + +/*-------------------------------------------------*/ +/* STRUCTURES FOR FFT CONTEXT / FUNCTION ARGUMENTS */ +/*-------------------------------------------------*/ + + +/** n_fft context: + * - basic parameters + * - precomputed powers of the primitive root of unity and its inverse + * - precomputed inverses of 2**k + * + * Requirements (not checked upon init): + * - 
mod is an odd prime < 2**(FLINT_BITS-2) + * - max_depth must be >= 3 (so, 8 must divide mod - 1) + * Total memory cost of precomputations for arrays tab_{w,iw,w2,inv2}: + * at most 2 * (2*FLINT_BITS + 2**depth) ulong's + * + * For more details about the content of tab_{w,iw,w2,inv2}, see comments below + **/ +typedef struct +{ + ulong mod; // modulus, odd prime + ulong max_depth; // maximum supported depth (w has order 2**max_depth) + ulong cofactor; // prime is 1 + cofactor * 2**max_depth + ulong depth; // depth supported by current precomputation + nn_ptr tab_w; // precomputed powers of w + nn_ptr tab_iw; // precomputed powers of 1/w + ulong tab_w2[2*FLINT_BITS]; // precomputed powers w**(2**k) + ulong tab_inv2[2*FLINT_BITS]; // precomputed inverses of 2**k +} n_fft_ctx_struct; +typedef n_fft_ctx_struct n_fft_ctx_t[1]; + + +/** n_fft arguments: + * - modulus mod + * - its double 2*mod (storing helps for speed) + * - precomputed powers of w + * To be used as an argument in FFT functions. In some parts, providing this + * instead of the whole context increased performance. Also, this facilitates + * using the same function with both tab_w and tab_iw (by forming an fft_args + * with Fargs->tab_w = F->tab_iw). + **/ +typedef struct +{ + ulong mod; // modulus, odd prime + ulong mod2; // 2*mod + nn_srcptr tab_w; // tabulated powers of w, see below +} n_fft_args_struct; +typedef n_fft_args_struct n_fft_args_t[1]; + + +/** tab_w2: + * - length 2*FLINT_BITS, with undefined entries at index 2*(max_depth-1) and beyond + * - contains powers w**d for d a power of 2, and corresponding + * precomputations for modular multiplication: + * -- for 0 <= k < max_depth-1, tab_w2[2*k] = w**(2**(max_depth-2-k)) + * and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod) + * -- for 2*(max_depth-1) <= k < 2*FLINT_BITS, tab_w2[k] is undefined + * + * --> one can retrieve w as tab_w2[2 * (max_depth-2)] + * --> the first elements are tab_w2 = [I, I_pr, J, J_pr, ...]
+ * where I is a square root of -1 and J is a square root of I + */ + +/** tab_w: + * - length 2**depth + * - contains 2**(depth-1) first powers of w in (max_depth-1)-bit reversed order, + * and corresponding precomputations for modular multiplication: + * -- for 0 <= k < 2**(depth-1), tab_w[2*k] = w**(br[k]) + * and tab_w[2*k+1] = floor(tab_w[2*k] * 2**FLINT_BITS / mod) + * where br = [0, 2**(max_depth-2), 2**(max_depth-3), 3 * 2**(max_depth-3), ...] + * is the bit reversal permutation of length 2**(max_depth-1) + * (https://en.wikipedia.org/wiki/Bit-reversal_permutation) + * + * In particular the first elements are + * tab_w = [1, 1_pr, I, I_pr, J, J_pr, IJ, IJ_pr, ...] + * where I is a square root of -1, J is a square root of I, and IJ = I*J. Note + * that powers of w beyond 2**(max_depth-1), for example -1, -I, -J, etc. are + * not stored. + **/ + +/** tab_iw: same as tab_w but for the primitive root 1/w */ + +/** tab_inv2: + * - length 2*FLINT_BITS, with undefined entries at index 2*max_depth and beyond + * - contains the modular inverses of 2**k, and corresponding + * precomputations for modular multiplication: + * -- for 0 <= k < max_depth, tab_inv2[2*k] = the inverse of 2**(k+1) + * modulo mod, and tab_inv2[2*k+1] = floor(tab_inv2[2*k] * 2**FLINT_BITS / mod) + * -- for 2*max_depth <= k < 2*FLINT_BITS, tab_inv2[k] is undefined + * + * Recall F->mod == 1 + cofactor * 2**max_depth, so + * 1 == F->mod - cofactor * 2**(max_depth - k) * 2**k + * --> the inverse of 2**k in (0, F->mod) is + * F->mod - cofactor * 2**(max_depth - k), + * we do not really need to store it, but we want the precomputations as well + */ + + +/*------------------------------------------*/ +/* PRECOMPUTATIONS / CONTEXT INITIALIZATION */ +/*------------------------------------------*/ + +/** Note for init functions, when depth is provided: + * - if it is < 3, it is pretended that it is 3 + * - if it is more than F->max_depth (the maximum possible with the given + * prime), it is reduced to
F->max_depth + * After calling init, precomputations support DFTs of length up to 2**depth + */ + +/* initialize with given root and given depth */ +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong mod); + +/* find primitive root, initialize with given depth */ +void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p); + +/* same, with default depth */ +FLINT_FORCE_INLINE +void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong p) +{ n_fft_ctx_init2_root(F, w, max_depth, cofactor, N_FFT_CTX_DEFAULT_DEPTH, p); } + +FLINT_FORCE_INLINE +void n_fft_ctx_init(n_fft_ctx_t F, ulong p) +{ n_fft_ctx_init2(F, N_FFT_CTX_DEFAULT_DEPTH, p); } + +/* grows F->depth and precomputations to support DFTs of depth up to depth */ +void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth); + +void n_fft_ctx_clear(n_fft_ctx_t F); + +FLINT_FORCE_INLINE +void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w) +{ + F->mod = mod; + F->mod2 = 2*mod; + F->tab_w = tab_w; +} + +/*-----------------------------*/ +/* DFT / IDFT / DFT_t / IDFT_t */ +/*-----------------------------*/ + +/** forward and inverse transforms, and their transposes: + * - length is a power of 2, len == 2**depth + * - requirement of all functions (not checked): depth <= F.depth + * - the comments below describe algorithms that modify the input array p in + * place: in these comments p stands for the input p, whereas q stands + * for the array p after running the algorithm + * - below in comments we write w[k] for 0 <= k < len/2, defined as + * w[2*k] == F->tab_w[2*k] + * w[2*k+1] == - F->tab_w[2*k] + * - hence the list w[k] for 0 <= k < len gives the len roots of the + * polynomial x**len - 1, which are all powers of the chosen len-th + * primitive root of unity, with exponents listed in bit reversed order + * - the matrix of DFT of length len is the len x len matrix + * DFT_{w,len} = [ w[i]**j ]_{0 <= i, j < len} + */ + +/** dft: 
discrete Fourier transform (q = DFT_{w,len} * p) + * In-place transform p = [p[j] for 0 <= j < len], seen as a polynomial p(x) of + * degree < len, into its evaluations + * q == [p(w[i]) for 0 <= i < len], + * where p(w[i]) = sum(p[j] * w[i]**j for 0 <= j < len) + */ + +/** idft: inverse discrete Fourier transform (q = DFT_{w,len}^{-1} * p) + * In-place transform p = [p[i] for 0 <= i < len] into the list of coefficients + * q = [q[j] for 0 <= j < len] of the unique polynomial q(x) of degree < len + * such that p[i] == q(w[i]) for 0 <= i < len + */ + +/** dft_t: transposed discrete Fourier transform (q = p * DFT_{w,len}) + * In-place transform p = [p[i] for 0 <= i < len] into the list of weighted + * power sums + * q == [PowerSum(p, w**j) for 0 <= j < len] + * where PowerSum(p, w**j) == sum(p[i] * w[i]**j for 0 <= i < len) + */ + +/** idft_t: transposed inverse discrete Fourier transform (q = p * DFT_{w,len}^{-1}) + * In-place transform p = [p[j] for 0 <= j < len] into the coefficients q = + * [q[i] for 0 <= i < len] which appear in the partial fraction decomposition + * p(x) = sum_{0 <= i < len} q[i] / (1 - w[i] * x) + O(x**len) + * where p(x) is the power series p(x) = sum_{0 <= j < len} p[j] x**j + O(x**len) + */ + +void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); + +void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); + +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); + +void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); + +#ifdef __cplusplus +} +#endif + +#endif /* N_FFT_H */ diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c new file mode 100644 index 0000000000..faba87e3da --- /dev/null +++ b/src/n_fft/ctx_init.c @@ -0,0 +1,175 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. 
+ + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "n_fft.h" +#include "ulong_extras.h" /* for mulmod_shoup* functions */ + +/** Given the precomputed quotient a_pr for modular multiplication by a mod n, + * a_pr == floor(a * 2**FLINT_BITS / n) + * where we assume 0 < a < n and n does not divide a * 2**FLINT_BITS, + * this returns the quotient for mulmod by -a mod n, + * floor( (n-a) * 2**FLINT_BITS / n) + * == 2**FLINT_BITS - ceil(a * 2**FLINT_BITS / n) + * == 2**FLINT_BITS - (a_pr + 1) + * == UWORD_MAX - a_pr + * (the ceiling equals a_pr + 1 because n does not divide a * 2**FLINT_BITS) + * + * Note: the requirement "n does not divide a * 2**FLINT_BITS" follows + * from the other requirement 0 < a < n as soon as n is odd; in n_fft.h + * we will only use this for odd primes + */ +FLINT_FORCE_INLINE ulong n_mulmod_precomp_shoup_negate(ulong a_pr) +{ + return UWORD_MAX - a_pr; +} + +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong p) +{ + if (depth < 3) + depth = 3; + if (max_depth < depth) + depth = max_depth; + + // fill basic attributes + F->mod = p; + F->max_depth = max_depth; + F->cofactor = cofactor; + F->depth = 3; // to be able to call fit_depth below + + // fill tab_w2 + ulong pr_quo, pr_rem, ww; + ww = w; + n_mulmod_precomp_shoup_quo_rem(&pr_quo, &pr_rem, ww, p); + F->tab_w2[2*(max_depth-2)] = ww; + F->tab_w2[2*(max_depth-2)+1] = pr_quo; + for (slong k = max_depth-3; k >= 0; k--) + { + // ww <- ww**2 and its precomputed quotient + n_mulmod_and_precomp_shoup(&ww, &pr_quo, ww, ww, pr_quo, pr_rem, pr_quo, p); + pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, p); + F->tab_w2[2*k] = ww; + F->tab_w2[2*k+1] = pr_quo; + } + // at this stage, pr_quo and pr_rem are for k == 0 i.e.
for I == tab_w2[0] + + // fill tab_inv2 + for (ulong k = 0; k < max_depth; k++) + { + F->tab_inv2[2*k] = p - (cofactor << (max_depth - k-1)); + F->tab_inv2[2*k+1] = n_mulmod_precomp_shoup(F->tab_inv2[2*k], p); + } + + // fill tab_w and tab_iw for depth 3 + ulong len = UWORD(1) << (depth-1); // len >= 4 + F->tab_w = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); + F->tab_iw = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); + + // w**0 == iw**0 == 1 + F->tab_w[0] = UWORD(1); + F->tab_w[1] = n_mulmod_precomp_shoup(UWORD(1), p); + F->tab_iw[0] = UWORD(1); + F->tab_iw[1] = F->tab_w[1]; + + // w**(L/4) == I and iw**(L/4) == -I, L == 2**max_depth + F->tab_w[2] = F->tab_w2[0]; + F->tab_w[3] = F->tab_w2[1]; + F->tab_iw[2] = p - F->tab_w2[0]; + F->tab_iw[3] = n_mulmod_precomp_shoup_negate(F->tab_w2[1]); + + // w**(L/8) == J and w**(3L/8) == I*J + F->tab_w[4] = F->tab_w2[2]; + F->tab_w[5] = F->tab_w2[3]; + n_mulmod_and_precomp_shoup(F->tab_w+6, F->tab_w+7, F->tab_w2[0], F->tab_w2[2], pr_quo, pr_rem, F->tab_w2[3], p); + + // iw**(L/8) == -I*J and iw**(3L/8) == -J + F->tab_iw[4] = p - F->tab_w[6]; + F->tab_iw[5] = n_mulmod_precomp_shoup_negate(F->tab_w[7]); + F->tab_iw[6] = p - F->tab_w[4]; + F->tab_iw[7] = n_mulmod_precomp_shoup_negate(F->tab_w[5]); + + // complete tab_w up to specified depth + n_fft_ctx_fit_depth(F, depth); +} + +void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) +{ + FLINT_ASSERT(p > 2 && flint_clz(p) >= 2); // 2 < p < 2**(FLINT_BITS-2) + FLINT_ASSERT(flint_ctz(p - UWORD(1)) >= 3); // p-1 divisible by 8 + + // find the constant and exponent such that p == c * 2**max_depth + 1 + const ulong max_depth = flint_ctz(p - UWORD(1)); + const ulong cofactor = (p - UWORD(1)) >> max_depth; + + // find primitive root w of order 2**max_depth + const ulong prim_root = n_primitive_root_prime(p); + const ulong w = n_powmod2(prim_root, cofactor, p); + + // fill all attributes and tables + n_fft_ctx_init2_root(F, w, max_depth, cofactor, depth, p); +} + +void 
n_fft_ctx_clear(n_fft_ctx_t F) +{ + flint_free(F->tab_w); + flint_free(F->tab_iw); +} + +void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) +{ + if (F->max_depth < depth) + depth = F->max_depth; + + if (depth > F->depth) + { + ulong len = UWORD(1) << (depth-1); // len >= 8 (since depth >= 4) + F->tab_w = flint_realloc(F->tab_w, 2*len * sizeof(ulong)); + F->tab_iw = flint_realloc(F->tab_iw, 2*len * sizeof(ulong)); + + // tab_w[2] is w**(L/8) * tab_w[0], where L = 2**max_depth, + // tab_w[2*4,2*6] is w**(L/16) * tab_w[2*0,2*2], + // tab_w[2*8,2*10,2*12,2*14] is w**(L/32) * tab_w[2*0,2*2,2*4,2*6], etc. + // recall tab_w2[2*k] == w**(L / 2**(k+2)) + ulong d = F->depth - 1; + ulong llen = UWORD(1) << (F->depth-1); + ulong ww, pr_quo, pr_rem; + for ( ; llen < len; llen <<= 1, d += 1) + { + ww = F->tab_w2[2*d]; + pr_quo = F->tab_w2[2*d+1]; + pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod); + // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation + for (ulong k = 0; k < llen; k+=4) + { + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+0), F->tab_w + 2*llen + 2*(k+0)+1, + ww, F->tab_w[2*(k+0)], + pr_quo, pr_rem, F->tab_w[2*(k+0)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+1), F->tab_w + 2*llen + 2*(k+1)+1, + ww, F->tab_w[2*(k+1)], + pr_quo, pr_rem, F->tab_w[2*(k+1)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+2), F->tab_w + 2*llen + 2*(k+2)+1, + ww, F->tab_w[2*(k+2)], + pr_quo, pr_rem, F->tab_w[2*(k+2)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+3), F->tab_w + 2*llen + 2*(k+3)+1, + ww, F->tab_w[2*(k+3)], + pr_quo, pr_rem, F->tab_w[2*(k+3)+1], F->mod); + + F->tab_iw[2*llen + 2*(llen-1-(k+0))] = F->mod - F->tab_w[2*llen + 2*(k+0)]; + F->tab_iw[2*llen + 2*(llen-1-(k+0)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+0)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+1))] = F->mod - F->tab_w[2*llen + 2*(k+1)]; + F->tab_iw[2*llen + 2*(llen-1-(k+1)) + 1] = 
n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+1)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+2))] = F->mod - F->tab_w[2*llen + 2*(k+2)]; + F->tab_iw[2*llen + 2*(llen-1-(k+2)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+2)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+3))] = F->mod - F->tab_w[2*llen + 2*(k+3)]; + F->tab_iw[2*llen + 2*(llen-1-(k+3)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+3)+1]); + } + } + + F->depth = depth; + } +} diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c new file mode 100644 index 0000000000..772edbd3b0 --- /dev/null +++ b/src/n_fft/dft.c @@ -0,0 +1,295 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "n_fft.h" +#include "n_fft_macros.h" + +/** Structure. + * - The main interface is n_fft_dft, it solves the problem at node 0 + * (evaluating at all roots of unity of order 2**depth), as documented + * in n_fft.h. + * - The core function is `dft_node_lazy_4_4`, which goes down the subproduct + * tree from an arbitrary node in this tree; it takes input values in [0..4n) + * and return values in [0..4n), following the idea of lazy butterflies + * highlighted by David Harvey [Faster arithmetic for number-theoretic + * transforms, Journal of Symbolic Computation, Volume 60, 2014, pp 113-119]. + * - This core function costs more than a DFT at node 0, at least for small or + * smallish lengths. So a specific function for node 0 is given + * (`dft_lazy_1_4`), targeting input values in [0..n) and return values in + * [0..4n) (it iself uses a similar function `dft_lazy_2_4`). The main + * function `n_fft_dft` just calls `dft_lazy_1_4` and then reduces the output + * to [0..n). 
+ */ + +/** Example for nodes/depth: + * if F.depth is 3, the tree of roots of unity in F->tab_w is + * 1 d3n0 <-- depth 3 + * / \ / \ + * 1 -1 d2n0 d2n1 <-- depth 2 + * / \ / \ = / \ / \ + * 1 -1 I -I d1n0 d1n1 d1n2 d1n3 <-- depth 1 + * / \ / \ / \ / \ / \ / \ / \ / \ + * 1 -1 I -I J -J IJ -IJ 1 -1 I -I J -J IJ -IJ <-- depth 0 + * stored as, omitting precomputations: + * F->tab_w == [1, 1_pr, I, I_pr, J, J_pr, IJ, IJ_pr] + * (the elements -1, -I, -J, -IJ are not stored) + * + * + * -> calling a function with depth==3 and node==0 is performing + * evaluation at all these 8 points (8th roots of 1) + * -> calling a function with depth==2 and node==0 is performing + * evaluation at all points at the leaves of the left child d2n0 + * of the root of the tree d3n0 (4th roots of 1) + * -> calling a function with depth==2 and node==1 is performing + * evaluation at all points at the leaves of the right child d2n1 + * of d3n0 (4th roots of -1) + * -> calling a function with depth==1 and node==1 is performing + * evaluation at all points at the leaves of the subtree rooted + * at d1n1 (square roots of -1) + * -> calling a function with depth==1 and node==2 is performing + * evaluation at all points at the leaves of the subtree rooted + * at d1n2 (square roots of I) + */ + +/*-----------------------*/ +/* auxiliary functions */ +/*-----------------------*/ + +/** 2**depth-point DFT, general node + * * In-place transform p of length len == 2**depth, seen as a polynomial of + * degree < len, into the concatenation of all polynomial evaluations + * [p(w_k), p(-w_k)] for k in range(len/2), + * where w_k = F->tab_w[2**depth * node + 2*k] for 0 <= k < 2**(depth-1) + * * By construction these evaluation points are the len roots of the + * polynomial x**len - F->tab_w[2*node]**2 (for example, if node == 0 this + * is x**len - 1, and if node == 1 this is x**len + 1) + * * Requirements (not checked): + * 3 <= depth + * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +void
dft_node_lazy_4_4(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) +{ + if (depth == 3) + { + DFT8_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); + } + else if (depth == 4) + { + DFT16_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + node, F->mod, F->mod2, F->tab_w); + } + else if (depth == 5) + { + DFT32_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + node, F->mod, F->mod2, F->tab_w); + } + else + { + const ulong len = UWORD(1) << depth; + + // 4-point butterflies + // in: [0..4n), out: [0..4n) + const nn_ptr p0 = p; + const nn_ptr p1 = p+len/4; + const nn_ptr p2 = p+2*len/4; + const nn_ptr p3 = p+3*len/4; + const ulong w2 = F->tab_w[2*node]; + const ulong w2pre = F->tab_w[2*node+1]; + const ulong w = F->tab_w[4*node]; + const ulong wpre = F->tab_w[4*node+1]; + const ulong Iw = F->tab_w[4*node+2]; + const ulong Iwpre = F->tab_w[4*node+3]; + + for (ulong k = 0; k < len/4; k+=4) + { + DFT4_NODE_LAZY_4_4(p0[k+0], p1[k+0], p2[k+0], p3[k+0], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2); + DFT4_NODE_LAZY_4_4(p0[k+1], p1[k+1], p2[k+1], p3[k+1], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2); + DFT4_NODE_LAZY_4_4(p0[k+2], p1[k+2], p2[k+2], p3[k+2], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2); + DFT4_NODE_LAZY_4_4(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2); + } + + // 4 recursive calls with depth-2 + dft_node_lazy_4_4(p0, depth-2, 4*node, F); + dft_node_lazy_4_4(p1, depth-2, 4*node+1, F); + dft_node_lazy_4_4(p2, depth-2, 4*node+2, F); + dft_node_lazy_4_4(p3, depth-2, 4*node+3, F); + } +} + +/** 2**depth-point DFT + * Same specification as n_fft_dft, except for: + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * 
requirement (not checked): depth <= F.depth + * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n + * requirement (not checked): 3 <= depth <= F.depth + */ +void dft_lazy_2_4(nn_ptr p, ulong depth, n_fft_args_t F) +{ + if (depth == 3) + { + DFT8_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); + } + else if (depth == 4) + { + DFT16_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + F->mod, F->mod2, F->tab_w); + } + else if (depth == 5) + { + DFT32_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + F->mod, F->mod2, F->tab_w); + } + else + { + const ulong len = UWORD(1) << depth; + + // 4-point butterflies + // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) + // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + for (ulong k = 0; k < len/4; k++) + { + DFT4_LAZY_2_4(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + if (p0[k] >= F->mod2) + p0[k] -= F->mod2; + } + + // 4 recursive calls with depth-2 + dft_lazy_2_4(p0, depth-2, F); + dft_node_lazy_4_4(p1, depth-2, 1, F); + dft_node_lazy_4_4(p2, depth-2, 2, F); + dft_node_lazy_4_4(p3, depth-2, 3, F); + } +} + +void dft_lazy_1_4(nn_ptr p, ulong depth, n_fft_args_t F) +{ + if (depth == 4) + { + DFT16_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + F->mod, F->mod2, F->tab_w); + } + else if (depth == 5) + { + DFT32_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], 
p[29], p[30], p[31], + F->mod, F->mod2, F->tab_w); + } + else if (depth > 5) + { + const ulong len = UWORD(1) << depth; + + // 4-point butterflies + // input p0,p1,p2,p3 in [0..n) x [0..n) x [0..n) x [0..n) + // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + for (ulong k = 0; k < len/4; k++) + { + DFT4_LAZY_1_4(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + if (p0[k] >= F->mod2) + p0[k] -= F->mod2; + } + + // 4 recursive calls with depth-2 + dft_lazy_2_4(p0, depth-2, F); + dft_node_lazy_4_4(p1, depth-2, 1, F); + dft_node_lazy_4_4(p2, depth-2, 2, F); + dft_node_lazy_4_4(p3, depth-2, 3, F); + } + else if (depth == 3) + { + DFT8_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); + } + else if (depth == 2) + { + DFT4_LAZY_1_4(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + } + else if (depth == 1) + { + DFT2_LAZY_1_2(p[0], p[1], F->mod); + } +} + +/*-------------------*/ +/* main interfaces */ +/*-------------------*/ + +void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + if (depth > 0) + { + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_w); + dft_lazy_1_4(p, depth, Fargs); + for (ulong k = 0; k < (UWORD(1) << depth); k++) + { + if (p[k] >= Fargs->mod2) + p[k] -= Fargs->mod2; + if (p[k] >= Fargs->mod) + p[k] -= Fargs->mod; + } + } +} + +void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + if (depth > 0) + { + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_iw); + dft_lazy_1_4(p, depth, Fargs); + + // see comments in idft concerning this loop + const ulong inv2 = F->tab_inv2[2*depth-2]; + const ulong inv2_pr = F->tab_inv2[2*depth-1]; + for (ulong k = 0; k < (UWORD(1) << depth); k++) + p[k] = n_mulmod_shoup(inv2, p[k], inv2_pr, F->mod); + } +} + +/*---------------*/ +/* some comments */ +/*---------------*/ + +/** In 
n_fft_idft_t, there is apparently no gain from using the lazy + * mulmod_shoup variant whose output is in [0..2n) (so one may as well use the + * non-lazy one which ensures output < n) + */ + +/** Lazier variants for DFT with general node: + * - lazy_1_4 variants would be basically identical to the lazy_2_4 variants (see the macros) + * - writing lazy_2_4 variants of the DFTxx_NODE_LAZY_4_4 macros and then of + * dft_node_lazy_4_4 brings almost no speedup (very marginal gain up to length + * 32 or 64, nothing observable beyond this) + */ + +/** Base cases: + * - having macros for "small" lengths (up to 16 or 32 at least) improves performance + * - removing the base cases depth==3 in internal functions where this case is + * not really used (eg dft_node_lazy_4_4) does not make a difference + */ diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c new file mode 100644 index 0000000000..f5c503686c --- /dev/null +++ b/src/n_fft/idft.c @@ -0,0 +1,212 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "n_fft.h" +#include "n_fft_macros.h" + +/** Structure. + * - The main interface is n_fft_idft, it solves the problem at node 0 + * (interpolating at all roots of unity of order 2**depth), as documented in + * n_fft.h. + * - The core function is `idft_node_lazy_1_2`, which goes up the subproduct + * tree towards an arbitrary node in this tree; it takes input values in + * [0..n) and return values in [0..2n), following the idea of lazy + * butterflies highlighted by David Harvey [Faster arithmetic for + * number-theoretic transforms, Journal of Symbolic Computation, Volume 60, + * 2014, pp 113-119]. This function does not scale the output by the inverse + * of 2**depth. 
+ * - This core function costs more than a iDFT at node 0, at least for small or + * smallish lengths. So a specific function for node 0 is given + * (`idft_lazy_1_4`), targeting input values in [0..n) and return values in + * [0..4n). The main function `n_fft_idft` just calls `idft_lazy_1_4`, and + * then scales the output value by the inverse of 2**depth, also ensuring the + * output is in [0..n). + */ + +/************************* +* auxiliary functions * +*************************/ + +/** 2**depth-point inverse DFT, general node + * * In-place transform p = [p[i] for 0 <= i < len], where len == 2**depth, + * into the list of coefficients q = [q[j] for 0 <= j < len] of the unique + * polynomial q(x) of degree < len such that p[i] == q(w[i]) for 0 <= i < len + * * Here we write w[k] for 0 <= k < len/2, defined as + * w[2*k] == F->tab_w[2**depth * node + 2*k] + * w[2*k+1] == - F->tab_w[2**depth * node + 2*k]; + * these are the len roots of the polynomial x**len - F->tab_w[2*node]**2 + * (for example, x**len - 1 if node == 0, and x**len + 1 if node == 1) + * * Requirements (not checked): + * 3 <= depth + * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) + * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n + */ +void idft_node_lazy_1_2(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) +{ + if (depth == 3) + { + IDFT8_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + node, F->mod, F->mod2, F->tab_w); + } + else if (depth == 4) + { + IDFT16_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + node, F->mod, F->mod2, F->tab_w); + } + else if (depth == 5) + { + IDFT32_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + node, F->mod, F->mod2, F->tab_w); + } + else + { + const ulong len = UWORD(1) << depth; + + // 4 recursive calls with depth-2 + const nn_ptr p0 = p; + const nn_ptr
p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + idft_node_lazy_1_2(p0, depth-2, 4*node, F); + idft_node_lazy_1_2(p1, depth-2, 4*node+1, F); + idft_node_lazy_1_2(p2, depth-2, 4*node+2, F); + idft_node_lazy_1_2(p3, depth-2, 4*node+3, F); + + const ulong w2 = F->tab_w[2*node]; + const ulong w2_pr = F->tab_w[2*node+1]; + const ulong w = F->tab_w[4*node]; + const ulong w_pr = F->tab_w[4*node+1]; + const ulong Iw = F->tab_w[4*node+2]; + const ulong Iw_pr = F->tab_w[4*node+3]; + + for (ulong k = 0; k < len/4; k+=4) + { + IDFT4_NODE_LAZY_2_2(p0[k+0], p1[k+0], p2[k+0], p3[k+0], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2); + IDFT4_NODE_LAZY_2_2(p0[k+1], p1[k+1], p2[k+1], p3[k+1], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2); + IDFT4_NODE_LAZY_2_2(p0[k+2], p1[k+2], p2[k+2], p3[k+2], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2); + IDFT4_NODE_LAZY_2_2(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2); + } + } +} + +/** 2**depth-point inverse DFT + * Same specification as n_fft_idft, except that the + * output values are in [0..4n) + */ +void idft_lazy_1_4(nn_ptr p, ulong depth, n_fft_args_t F) +{ + if (depth == 0) + return; + + if (depth == 1) + { + DFT2_LAZY_1_2(p[0], p[1], F->mod); + } + else if (depth == 2) + { + IDFT4_LAZY_1_4(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2); + } + else + if (depth == 3) + { + IDFT8_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + F->mod, F->mod2, F->tab_w); + } + else if (depth == 4) + { + IDFT16_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + F->mod, F->mod2, F->tab_w); + } + else if (depth == 5) + { + IDFT32_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + F->mod, F->mod2, F->tab_w); + 
} + else + { + const ulong len = UWORD(1) << depth; + + // 4 recursive calls with depth-2 + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + idft_lazy_1_4(p0, depth-2, F); + idft_node_lazy_1_2(p1, depth-2, 1, F); + idft_node_lazy_1_2(p2, depth-2, 2, F); + idft_node_lazy_1_2(p3, depth-2, 3, F); + + // 4-point butterflies + // input p0 in [0,4n), p1,p2,p3 in [0,2n) + // output p0,p1,p2,p3 in [0,4n) + for (ulong k = 0; k < len/4; k+=4) + { + IDFT4_LAZY_4222_4(p0[k+0], p1[k+0], p2[k+0], p3[k+0], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + IDFT4_LAZY_4222_4(p0[k+1], p1[k+1], p2[k+1], p3[k+1], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + IDFT4_LAZY_4222_4(p0[k+2], p1[k+2], p2[k+2], p3[k+2], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + IDFT4_LAZY_4222_4(p0[k+3], p1[k+3], p2[k+3], p3[k+3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + } + } +} + + +/*-------------------*/ +/* main interfaces */ +/*-------------------*/ + +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + if (depth > 0) + { + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_w); + idft_lazy_1_4(p, depth, Fargs); + for (ulong k = 0; k < (UWORD(1) << depth); k++) + { + if (p[k] >= Fargs->mod2) + p[k] -= Fargs->mod2; + if (p[k] >= Fargs->mod) + p[k] -= Fargs->mod; + } + } +} + +void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + if (depth > 0) + { + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_iw); + idft_lazy_1_4(p, depth, Fargs); + + const ulong inv2 = F->tab_inv2[2*depth-2]; + const ulong inv2_pr = F->tab_inv2[2*depth-1]; + for (ulong k = 0; k < (UWORD(1) << depth); k++) + p[k] = n_mulmod_shoup(inv2, p[k], inv2_pr, F->mod); + } +} + +/*---------------*/ +/* some comments */ +/*---------------*/ + +/** In n_fft_idft, there is apparently no gain from using the lazy mulmod_shoup + * variant whose output is in [0..2n) (so one may as well use the non-lazy one + * which ensures output < n) + */ diff 
--git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h new file mode 100644 index 0000000000..c2b33e922b --- /dev/null +++ b/src/n_fft/n_fft_macros.h @@ -0,0 +1,997 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See <https://www.gnu.org/licenses/>. +*/ + +#ifndef N_FFT_MACROS_H +#define N_FFT_MACROS_H + +#include "longlong.h" /* for umul_ppmm */ +#include "ulong_extras.h" /* for mulmod_shoup* functions */ + +/*---------*/ +/* helpers */ +/*---------*/ + +/** Shoup's modular multiplication with precomputation, lazy + * (does not perform the excess correction step) + * --> computes either r or r+n and stores it in res, where r = (a*b) % n + * --> a_pr is the precomputed data for multiplication by a mod n, p_hi and p_lo are temporaries + */ +#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n) \ +do { \ + ulong p_hi, p_lo; \ + umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ + res = (a) * (b) - p_hi * (n); \ +} while(0) + +/*------------------*/ +/* length 2, node 0 */ +/*------------------*/ + +/** Butterfly radix 2 + * * In-place transform: [1 1] + * [a b] <- [a b] [1 -1] + * * n is the modulus, n2 is 2*n + * * lazy_1_2: in [0..n) / out [0..2n) / max < 2n + * * lazy_22_24: in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n + * * lazy_42_44: in [0..4n) x [0..2n) / out [0..4n) x [0..4n) / max < 4n + */ +#define DFT2_LAZY_1_2(a, b, n) \ +do { \ + ulong tmp; \ + tmp = (b); \ + (b) = (a) + (n) - tmp; \ + (a) = (a) + tmp; \ +} while(0) + +#define DFT2_LAZY_22_24(a, b, n2) \ +do { \ + ulong tmp; \ + tmp = (b); \ + (b) = (a) + (n2) - tmp; \ + (a) = (a) + tmp; \ + if ((a) >= (n2)) \ + (a) -= (n2); \ +} while(0) + +#define DFT2_LAZY_42_44(a, b, n2) \ +do { \ + ulong tmp; \ + tmp = (a); \ + if (tmp >= (n2)) \ + tmp -= (n2); /* [0..2n) */ \ + (a) = tmp + (b); 
/* [0..4n) */ \ + (b) = tmp + (n2) - (b); /* [0..4n) */ \ +} while(0) + +/*----------------------------------------------*/ +/* length 2, general node */ +/* (Cooley-Tukey & Gentleman-Sande butterflies) */ +/*----------------------------------------------*/ + +/** Cooley-Tukey butterfly: + * * In-place transform + * [1 1] + * [a b] <- [a b] [w -w] + * * n2 is 2*n, w_pr is the precomputed data for multiplication by w mod n + * * can be seen as evaluation at points w and -w of a+b*x + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +#define DFT2_NODE_LAZY_4_4(a, b, w, w_pr, n, n2) \ +do { \ + ulong u, v; \ + u = (a); \ + if (u >= (n2)) \ + u -= (n2); /* [0..2n) */ \ + v = (b); \ + N_MULMOD_PRECOMP_LAZY(v, w, v, w_pr, n); /* v = w*b mod n, possibly plus n: [0..2n) */ \ + (a) = u + v; /* [0..4n) */ \ + (b) = u + (n2) - v; /* [0..4n) */ \ +} while(0) + +/** Gentleman-Sande butterfly: + * * In-place transform + * [1 w] + * [a b] <- [a b] [1 -w] + * * n2 is 2*n, w_pr is the precomputed data for multiplication by w mod n + * * can be seen as degree-1 interpolation at points iw = 1 / w and -iw, up to + * a scaling by 1/2, since the inverse of [1 w] is 1/2 * [ 1 1] + * [1 -w] [iw -iw] + * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n + */ +#define IDFT2_NODE_LAZY_2_2(a, b, w, w_pr, \ + n, n2) \ +do { \ + ulong tmp; \ + tmp = (a) + (n2) - (b); /* [0..4n) */ \ + (a) = (a) + (b); /* [0..4n) */ \ + if ((a) >= (n2)) \ + (a) -= (n2); /* [0..2n) */ \ + N_MULMOD_PRECOMP_LAZY((b), w, tmp, w_pr, n); \ + /* --> (b) in [0..2n) */ \ +} while(0) + +/*------------------*/ +/* length 4, node 0 */ +/*------------------*/ + +/** 4-point FFT evaluation + * * In-place transform + * [1 1 1 1] + * [1 -1 I -I] + * [a b c d] <- [a b c d] [1 1 -1 -1] + * [1 -1 -I I] + * [1 0 1 0] [1 1 0 0] + * == [a b c d] [0 1 0 I] [1 -1 0 0] + * [1 0 -1 0] [0 0 1 1] + * [0 1 0 -I] [0 0 1 -1] + * * Corresponds to reducing down the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + * where I is typically a square root of -1 + * (but 
this property is not exploited) + * * n is the modulus, n2 is 2*n + * I_pr is the precomputed data for multiplication by I mod n + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n + */ +#define DFT4_LAZY_1_4(a, b, c, d, I, I_pr, n, n2) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 2*n */ \ + ulong v5 = v0 + (n) - v2; /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n) - v3, (I_pr), (n)); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + +#define DFT4_LAZY_2_4(a, b, c, d, I, I_pr, n, n2) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n)); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + +/** 4-point FFT interpolation + * * In-place transform + * [1 1 1 1] + * [1 -1 1 -1] + * [a b c d] <- [a b c d] [1 -I -1 I] + * [1 I -1 -I] + * [1 1 0 0] [1 0 1 0] + * == [a b c d] [1 -1 0 0] [0 1 0 1] + * [0 0 1 I] [1 0 -1 0] + * [0 0 1 -I] [0 1 0 -1] + * + * * If I**2 == -1, this matrix is the inverse of the one above; this + * corresponds to interpolation at 1, -1, I, -I, up to scaling by 1/4; or to + * going up the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + * * n is the modulus, n2 is 2*n + * I_pr is the precomputed data for 
multiplication by I mod n + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * * lazy_4222_4: a in [0..4n), b,c,d in [0..2n) / out [0..4n) / max < 4n + */ +#define IDFT4_LAZY_1_4(a, b, c, d, I, I_pr, n, n2) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 2*n */ \ + ulong v5 = v0 + (n) - v1; /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n)); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + +#define IDFT4_LAZY_4222_4(a, b, c, d, I, I_pr, n, n2) \ +do { \ + ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + if (v0 >= (n2)) \ + v0 -= (n2); /* < 2*n */ \ + ulong v4 = v0 + v1; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v1; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n2) - v3, (I_pr), (n)); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + +/*------------------------*/ +/* length 4, general node */ +/*------------------------*/ + +/** 4-point FFT, evaluation, from general node + * * In-place transform + * [ 1 1 1 1] + * [w2 -w2 w3 -w3] + * [a b c d] <- [a b c d] [w1 w1 -w1 -w1] + * [w1*w2 -w1*w2 -w1*w3 w1*w3] + * * Corresponds to reducing down the tree with nodes + * x^4 - w1**2 + * / \ + * x^2 - w1 x^2 + w1 + * / \ / \ + * x - w2 x + w2 x - w3 x + w3 + * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above + * is a Vandermonde matrix and this tree really is the subproduct tree built + * from the four roots w2, -w2, I*w2, -I*w2 of x**4 
- w1 + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +#define DFT4_NODE_LAZY_4_4(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2) \ +do { \ + ulong tmp; \ + ulong u0 = (a); \ + ulong u1 = (b); \ + ulong u2 = (c); \ + ulong u3 = (d); \ + if (u0 >= n2) \ + u0 -= n2; \ + if (u1 >= n2) \ + u1 -= n2; \ + \ + N_MULMOD_PRECOMP_LAZY(u2, w1, u2, w1_pr, n); \ + tmp = u0; \ + u0 = u0 + u2; /* [0..4n) */ \ + u2 = tmp + n2 - u2; /* [0..4n) */ \ + if (u0 >= n2) \ + u0 -= n2; /* [0..2n) */ \ + if (u2 >= n2) \ + u2 -= n2; /* [0..2n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u3, w1, u3, w1_pr, n); \ + tmp = u1; \ + u1 = u1 + u3; /* [0..4n) */ \ + u3 = tmp + n2 - u3; /* [0..4n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u1, w2, u1, w2_pr, n); \ + (a) = u0 + u1; /* [0..4n) */ \ + (b) = u0 + n2 - u1; /* [0..4n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u3, w3, u3, w3_pr, n); \ + (c) = u2 + u3; /* [0..4n) */ \ + (d) = u2 + n2 - u3; /* [0..4n) */ \ +} while(0) + +/** 4-point FFT, interpolation, general node + * * In-place transform + * [ 1 iw2 iw1 iw1*iw2] + * [ 1 -iw2 iw1 -iw1*iw2] + * [a b c d] <- [a b c d] [ 1 iw3 -iw1 -iw1*iw3] + * [ 1 -iw3 -iw1 iw1*iw3] + * [1 iw2 0 0] [1 0 w1 0] + * == [a b c d] [1 -iw2 0 0] [0 1 0 w1] + * [0 0 1 iw3] [1 0 -w1 0] + * [0 0 1 -iw3] [0 1 0 -w1] + * * Corresponds, up to scaling by 1/4, to going up the tree with nodes + * x^4 - w1**2 + * / \ + * x^2 - w1 x^2 + w1 + * / \ / \ + * x - w2 x + w2 x - w3 x + w3 + * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above + * is the inverse of a Vandermonde matrix and this tree really is the + * subproduct tree built from the four roots w2, -w2, I*w2, -I*w2 of x**4 - w1 + * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n + * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n + */ +#define IDFT4_NODE_LAZY_2_2(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; 
/* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5; \ + N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n2) - v1, (w2_pr), (n)); \ + ulong v6 = v2 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n2) - v3, (w3_pr), (n)); \ + \ + (a) = v4 + v6; \ + if ((a) >= (n2)) \ + (a) -= (n2); /* < 2*n */ \ + (b) = v5 + v7; \ + if ((b) >= (n2)) \ + (b) -= (n2); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n)); \ + N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n)); \ +} while(0) + +#define IDFT4_NODE_LAZY_1_2(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 2*n */ \ + ulong v5; \ + N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n) - v1, (w2_pr), (n)); \ + ulong v6 = v2 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n) - v3, (w3_pr), (n)); \ + \ + (a) = v4 + v6; /* < 4*n */ \ + if ((a) >= (n2)) \ + (a) -= (n2); /* < 2*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + if ((b) >= (n2)) \ + (b) -= (n2); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n)); \ + N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n)); \ +} while(0) + +/*------------------*/ +/* length 8, node 0 */ +/*------------------*/ + +/** 8-point FFT, evaluation + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial + * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations + * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) + * i.e. 
the evaluations at all 8-th roots of unity J**k for 0 <= k < 8 in + * bit-reversed order + * * Recall: [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ]) + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n + */ +#define DFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + n, n2, tab_w) \ +do { \ + DFT2_LAZY_1_2(p0, p4, n); \ + DFT2_LAZY_1_2(p1, p5, n); \ + DFT2_LAZY_1_2(p2, p6, n); \ + DFT2_LAZY_1_2(p3, p7, n); \ + \ + DFT4_LAZY_2_4(p0, p1, p2, p3, \ + tab_w[2], tab_w[3], \ + n, n2); \ + /* could use a lazy_2_4 variant of the */ \ + /* next one, but the gain is negligible */ \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ +} while(0) + +#define DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + n, n2, tab_w) \ +do { \ + DFT2_LAZY_22_24(p0, p4, n2); \ + DFT2_LAZY_22_24(p1, p5, n2); \ + DFT2_LAZY_22_24(p2, p6, n2); \ + DFT2_LAZY_22_24(p3, p7, n2); \ + \ + DFT4_LAZY_2_4(p0, p1, p2, p3, \ + tab_w[2], tab_w[3], \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ +} while(0) + +/** 8-point FFT, interpolation + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as the evaluations + * [p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J)] + * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order + * of a polynomial p(x) of degree < 8, into the coefficients of this polynomial + * * Recall: [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ]) + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + */ +#define IDFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + n, n2, tab_w) \ +do { \ + IDFT4_LAZY_1_4(p0, p1, p2, p3, \ + tab_w[2], tab_w[3], \ + n, n2); \ + IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ + \ + DFT2_LAZY_42_44(p0, p4, n2); \ + DFT2_LAZY_42_44(p1, p5, n2); \ + 
DFT2_LAZY_42_44(p2, p6, n2); \ + DFT2_LAZY_42_44(p3, p7, n2); \ +} while(0) + +/*------------------------*/ +/* length 8, general node */ +/*------------------------*/ + +/** 8-point FFT, evaluation, general node + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial + * p(x) = p0 + p1*x + ... + p7*x**7, into its evaluations + * p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3) + * where w_k = F->tab_w[8*node + 2*k] for 0 <= k < 4 + * * By construction these 8 evaluation points are the 8 roots of the + * polynomial x**8 - F->tab_w[node] + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +#define DFT8_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + node, n, n2, tab_w) \ +do { \ + const ulong w = tab_w[2*(node)]; \ + const ulong w_pr = tab_w[2*(node)+1]; \ + DFT2_NODE_LAZY_4_4(p0, p4, w, w_pr, n, n2); \ + DFT2_NODE_LAZY_4_4(p1, p5, w, w_pr, n, n2); \ + DFT2_NODE_LAZY_4_4(p2, p6, w, w_pr, n, n2); \ + DFT2_NODE_LAZY_4_4(p3, p7, w, w_pr, n, n2); \ + \ + DFT4_NODE_LAZY_4_4(p0, p1, p2, p3, \ + tab_w[4*(node)], tab_w[4*(node)+1], \ + tab_w[8*(node)], tab_w[8*(node)+1], \ + tab_w[8*(node)+2], tab_w[8*(node)+3], \ + n, n2); \ + \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + tab_w[4*(node)+2], tab_w[4*(node)+3], \ + tab_w[8*(node)+4], tab_w[8*(node)+5], \ + tab_w[8*(node)+6], tab_w[8*(node)+7], \ + n, n2); \ +} while(0) + +/** 8-point FFT, interpolation, general node + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as the evaluations + * [p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3)] + * where w_k = F->tab_w[8*node + 2*k] for 0 <= k < 4 of a polynomial p(x) of + * degree < 8, into the coefficients of this polynomial + * * By construction these 8 evaluation points are the 8 roots of the + * polynomial x**8 - F->tab_w[node] + * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n + * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n + */ +#define IDFT8_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, \ + node, n, n2, tab_w) \ 
+do { \ + const ulong w = tab_w[2*(node)]; \ + const ulong w_pr = tab_w[2*(node)+1]; \ + \ + IDFT4_NODE_LAZY_1_2(p0, p1, p2, p3, \ + tab_w[4*(node)], tab_w[4*(node)+1], \ + tab_w[8*(node)], tab_w[8*(node)+1], \ + tab_w[8*(node)+2], tab_w[8*(node)+3], \ + n, n2); \ + \ + IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7, \ + tab_w[4*(node)+2], tab_w[4*(node)+3], \ + tab_w[8*(node)+4], tab_w[8*(node)+5], \ + tab_w[8*(node)+6], tab_w[8*(node)+7], \ + n, n2); \ + \ + IDFT2_NODE_LAZY_2_2(p0, p4, w, w_pr, n, n2); \ + IDFT2_NODE_LAZY_2_2(p1, p5, w, w_pr, n, n2); \ + IDFT2_NODE_LAZY_2_2(p2, p6, w, w_pr, n, n2); \ + IDFT2_NODE_LAZY_2_2(p3, p7, w, w_pr, n, n2); \ +} while(0) + +/*-------------------*/ +/* length 16, node 0 */ +/*-------------------*/ + +/** 16-point FFT, evaluation + * * In-place transform p of length 16, seen as a polynomial + * p(x) = p0 + p1*x + ... + p15*x**15, into its evaluations + * at all 16-th roots of unity 1, -1, I, -I... (bit-reversed order) + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n + */ +#define DFT16_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + n, n2, tab_w) \ +do { \ + DFT4_LAZY_1_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2); \ + if (p0 >= n2) \ + p0 -= n2; \ + DFT4_LAZY_1_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2); \ + if (p1 >= n2) \ + p1 -= n2; \ + DFT4_LAZY_1_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2); \ + if (p2 >= n2) \ + p2 -= n2; \ + DFT4_LAZY_1_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2); \ + if (p3 >= n2) \ + p3 -= n2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_LAZY_2_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2); \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p8, p9, p10, p11, \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p12, p13, 
p14, p15, \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + n, n2); \ +} while(0) + +#define DFT16_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + n, n2, tab_w) \ +do { \ + DFT4_LAZY_2_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2); \ + if (p0 >= n2) \ + p0 -= n2; \ + DFT4_LAZY_2_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2); \ + if (p1 >= n2) \ + p1 -= n2; \ + DFT4_LAZY_2_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2); \ + if (p2 >= n2) \ + p2 -= n2; \ + DFT4_LAZY_2_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2); \ + if (p3 >= n2) \ + p3 -= n2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_LAZY_2_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2); \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p8, p9, p10, p11, \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p12, p13, p14, p15, \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + n, n2); \ +} while(0) + +/** 16-point FFT, interpolation + * * In-place transform p of length 16, seen as the evaluations at all 16-th + * roots of unity 1, -1, I, -I... 
(bit-reversed order) of a polynomial p(x) of + * degree < 16, into the coefficients of this polynomial + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + */ +#define IDFT16_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + n, n2, tab_w) \ +do { \ + IDFT4_LAZY_1_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2); \ + IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + n, n2); \ + IDFT4_NODE_LAZY_1_2(p8, p9, p10, p11, \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + n, n2); \ + IDFT4_NODE_LAZY_1_2(p12, p13, p14, p15, \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + n, n2); \ + \ + IDFT4_LAZY_4222_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2); \ +} while(0) + +/*-------------------------*/ +/* length 16, general node */ +/*-------------------------*/ + +/** 16-point FFT, evaluation, general node + * * In-place transform p of length 16, seen as a polynomial + * p(x) = p0 + p1*x + ... 
+ p15*x**15, into its evaluations at + * p(w0), p(-w0), p(w1), p(-w1), ..., p(w7), p(-w7) + * where w_k = F->tab_w[16*node + 2*k] for 0 <= k < 8 + * * By construction these 16 evaluation points are the 16 roots of the + * polynomial x**16 - F->tab_w[node] + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +#define DFT16_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + node, n, n2, tab_w) \ +do { \ + ulong w2, w2pre, w, wpre, Iw, Iwpre; \ + /* 4-point transforms on the stride-4 columns (p_i, p_{i+4}, p_{i+8}, p_{i+12}), shared twiddles */ \ + w2 = tab_w[2*(node)]; /* (node) is parenthesized: the argument may be an expression */ \ + w2pre = tab_w[2*(node)+1]; \ + w = tab_w[4*(node)]; \ + wpre = tab_w[4*(node)+1]; \ + Iw = tab_w[4*(node)+2]; \ + Iwpre = tab_w[4*(node)+3]; \ + \ + DFT4_NODE_LAZY_4_4(p0, p4, p8, p12, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p1, p5, p9, p13, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p2, p6, p10, p14, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + DFT4_NODE_LAZY_4_4(p3, p7, p11, p15, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + /* 4-point transforms on the four contiguous blocks, each with its own twiddles */ \ + w2 = tab_w[8*(node)]; \ + w2pre = tab_w[8*(node)+1]; \ + w = tab_w[16*(node)]; \ + wpre = tab_w[16*(node)+1]; \ + Iw = tab_w[16*(node)+2]; \ + Iwpre = tab_w[16*(node)+3]; \ + DFT4_NODE_LAZY_4_4(p0, p1, p2, p3, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+2]; \ + w2pre = tab_w[8*(node)+3]; \ + w = tab_w[16*(node)+4]; \ + wpre = tab_w[16*(node)+5]; \ + Iw = tab_w[16*(node)+6]; \ + Iwpre = tab_w[16*(node)+7]; \ + DFT4_NODE_LAZY_4_4(p4, p5, p6, p7, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+4]; \ + w2pre = tab_w[8*(node)+5]; \ + w = tab_w[16*(node)+8]; \ + wpre = tab_w[16*(node)+9]; \ + Iw = tab_w[16*(node)+10]; \ + Iwpre = tab_w[16*(node)+11]; \ + DFT4_NODE_LAZY_4_4(p8, p9, p10, p11, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+6]; \ + w2pre = tab_w[8*(node)+7]; \ + w = tab_w[16*(node)+12]; \ + wpre = tab_w[16*(node)+13]; \ + Iw = tab_w[16*(node)+14]; \ + Iwpre = tab_w[16*(node)+15]; \ + DFT4_NODE_LAZY_4_4(p12, p13, p14, p15, \ + 
w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ +} while(0) + +/** 16-point FFT, interpolation, general node + * * In-place transform p of length 16, seen as the evaluations at + * w0, -w0, w1, -w1, ..., w7, -w7 + * where w_k = F->tab_w[16*node + 2*k] for 0 <= k < 8 + * of a polynomial of degree < 16, into the coefficients of this polynomial + * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n + */ +#define IDFT16_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + node, n, n2, tab_w) \ +do { \ + ulong w2, w2pre, w, wpre, Iw, Iwpre; \ + /* 4-point interpolations on the four contiguous blocks, each with its own twiddles */ \ + w2 = tab_w[8*(node)]; /* (node) is parenthesized: the argument may be an expression */ \ + w2pre = tab_w[8*(node)+1]; \ + w = tab_w[16*(node)]; \ + wpre = tab_w[16*(node)+1]; \ + Iw = tab_w[16*(node)+2]; \ + Iwpre = tab_w[16*(node)+3]; \ + IDFT4_NODE_LAZY_1_2(p0, p1, p2, p3, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+2]; \ + w2pre = tab_w[8*(node)+3]; \ + w = tab_w[16*(node)+4]; \ + wpre = tab_w[16*(node)+5]; \ + Iw = tab_w[16*(node)+6]; \ + Iwpre = tab_w[16*(node)+7]; \ + IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+4]; \ + w2pre = tab_w[8*(node)+5]; \ + w = tab_w[16*(node)+8]; \ + wpre = tab_w[16*(node)+9]; \ + Iw = tab_w[16*(node)+10]; \ + Iwpre = tab_w[16*(node)+11]; \ + IDFT4_NODE_LAZY_1_2(p8, p9, p10, p11, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + \ + w2 = tab_w[8*(node)+6]; \ + w2pre = tab_w[8*(node)+7]; \ + w = tab_w[16*(node)+12]; \ + wpre = tab_w[16*(node)+13]; \ + Iw = tab_w[16*(node)+14]; \ + Iwpre = tab_w[16*(node)+15]; \ + IDFT4_NODE_LAZY_1_2(p12, p13, p14, p15, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + /* 4-point interpolations on the stride-4 columns, shared twiddles; inputs are now in [0..2n) */ \ + w2 = tab_w[2*(node)]; \ + w2pre = tab_w[2*(node)+1]; \ + w = tab_w[4*(node)]; \ + wpre = tab_w[4*(node)+1]; \ + Iw = tab_w[4*(node)+2]; \ + Iwpre = tab_w[4*(node)+3]; \ + \ + IDFT4_NODE_LAZY_2_2(p0, p4, p8, p12, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + IDFT4_NODE_LAZY_2_2(p1, p5, p9, p13, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + IDFT4_NODE_LAZY_2_2(p2, p6, p10, p14, 
\ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ + IDFT4_NODE_LAZY_2_2(p3, p7, p11, p15, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + n, n2); \ +} while(0) + + +/*-------------------*/ +/* length 32, node 0 */ +/*-------------------*/ + +/** 32-point FFT, evaluation + * * In-place transform p of length 32, seen as a polynomial + * p(x) = p0 + p1*x + ... + p31*x**31, into its evaluations + * at all 32-th roots of unity 1, -1, I, -I... (bit-reversed order) + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n + */ +#define DFT32_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + n, n2, tab_w) \ +do { \ + DFT4_LAZY_1_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2); \ + if (p0 >= n2) \ + p0 -= n2; \ + DFT4_LAZY_1_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2); \ + if (p1 >= n2) \ + p1 -= n2; \ + DFT4_LAZY_1_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2); \ + if (p2 >= n2) \ + p2 -= n2; \ + DFT4_LAZY_1_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2); \ + if (p3 >= n2) \ + p3 -= n2; \ + DFT4_LAZY_1_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2); \ + if (p4 >= n2) \ + p4 -= n2; \ + DFT4_LAZY_1_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2); \ + if (p5 >= n2) \ + p5 -= n2; \ + DFT4_LAZY_1_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2); \ + if (p6 >= n2) \ + p6 -= n2; \ + DFT4_LAZY_1_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2); \ + if (p7 >= n2) \ + p7 -= n2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w); \ +} while(0) + +#define DFT32_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + 
p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + n, n2, tab_w) \ +do { \ + DFT4_LAZY_2_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2); \ + if (p0 >= n2) \ + p0 -= n2; \ + DFT4_LAZY_2_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2); \ + if (p1 >= n2) \ + p1 -= n2; \ + DFT4_LAZY_2_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2); \ + if (p2 >= n2) \ + p2 -= n2; \ + DFT4_LAZY_2_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2); \ + if (p3 >= n2) \ + p3 -= n2; \ + DFT4_LAZY_2_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2); \ + if (p4 >= n2) \ + p4 -= n2; \ + DFT4_LAZY_2_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2); \ + if (p5 >= n2) \ + p5 -= n2; \ + DFT4_LAZY_2_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2); \ + if (p6 >= n2) \ + p6 -= n2; \ + DFT4_LAZY_2_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2); \ + if (p7 >= n2) \ + p7 -= n2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w); \ +} while(0) + +/** 32-point FFT, interpolation + * * In-place transform p of length 32, seen as the evaluations at all 32-th + * roots of unity 1, -1, I, -I... 
(bit-reversed order) of a polynomial p(x) of + * degree < 32, into the coefficients of this polynomial + * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n + */ +#define IDFT32_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + n, n2, tab_w) \ +do { \ + IDFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w); \ + IDFT8_NODE_LAZY_1_2(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w); \ + IDFT8_NODE_LAZY_1_2(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w); \ + IDFT8_NODE_LAZY_1_2(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w); \ + \ + IDFT4_LAZY_4222_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2); \ + IDFT4_LAZY_4222_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2); \ +} while(0) + +/*-------------------------*/ +/* length 32, general node */ +/*-------------------------*/ + +/** 32-point FFT, evaluation, general node + * * In-place transform p of length 32, seen as a polynomial + * p(x) = p0 + p1*x + ... 
+ p31*x**31, into its evaluations at + * p(w0), p(-w0), p(w1), p(-w1), ..., p(w15), p(-w15) + * where w_k = F->tab_w[32*node + 2*k] for 0 <= k < 16 + * * By construction these 32 evaluation points are the 32 roots of the + * polynomial x**32 - F->tab_w[node] + * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n + */ +#define DFT32_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + node, n, n2, tab_w) \ +do { \ + ulong w2 = tab_w[2*(node)]; /* (node) is parenthesized: the argument may be an expression */ \ + ulong w2pre = tab_w[2*(node)+1]; \ + ulong w = tab_w[4*(node)]; \ + ulong wpre = tab_w[4*(node)+1]; \ + ulong Iw = tab_w[4*(node)+2]; \ + ulong Iwpre = tab_w[4*(node)+3]; /* below: 4-point transforms on the stride-8 columns, shared twiddles */ \ + DFT4_NODE_LAZY_4_4(p0, p8, p16, p24, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p1, p9, p17, p25, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p2, p10, p18, p26, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p3, p11, p19, p27, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p4, p12, p20, p28, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p5, p13, p21, p29, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p6, p14, p22, p30, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + DFT4_NODE_LAZY_4_4(p7, p15, p23, p31, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \ + /* 8-point transforms on the four contiguous blocks, at child nodes 4*node .. 4*node+3 */ \ + DFT8_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, 4*(node), n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 4*(node)+1, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 4*(node)+2, n, n2, tab_w); \ + DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 4*(node)+3, n, n2, tab_w); \ +} while(0) + +/** 32-point FFT, interpolation, general node + * * In-place transform p of length 32, seen as the evaluations at + * w0, -w0, w1, -w1, ..., w15, -w15 + * where w_k = F->tab_w[32*node + 2*k] for 0 <= k < 16 of a polynomial of + * degree < 32, into the 
coefficients of this polynomial
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ */
+#define IDFT32_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, \
+                             p8, p9, p10, p11, p12, p13, p14, p15, \
+                             p16, p17, p18, p19, p20, p21, p22, p23, \
+                             p24, p25, p26, p27, p28, p29, p30, p31, \
+                             node, n, n2, tab_w) \
+do { \
+    IDFT8_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, 4*(node), n, n2, tab_w); \
+    IDFT8_NODE_LAZY_1_2(p8, p9, p10, p11, p12, p13, p14, p15, 4*(node)+1, n, n2, tab_w); \
+    IDFT8_NODE_LAZY_1_2(p16, p17, p18, p19, p20, p21, p22, p23, 4*(node)+2, n, n2, tab_w); \
+    IDFT8_NODE_LAZY_1_2(p24, p25, p26, p27, p28, p29, p30, p31, 4*(node)+3, n, n2, tab_w); \
+    \
+    ulong w2 = tab_w[2*(node)]; \
+    ulong w2pre = tab_w[2*(node)+1]; \
+    ulong w = tab_w[4*(node)]; \
+    ulong wpre = tab_w[4*(node)+1]; \
+    ulong Iw = tab_w[4*(node)+2]; \
+    ulong Iwpre = tab_w[4*(node)+3]; \
+    IDFT4_NODE_LAZY_2_2(p0, p8, p16, p24, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p1, p9, p17, p25, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p2, p10, p18, p26, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p3, p11, p19, p27, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p4, p12, p20, p28, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p5, p13, p21, p29, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p6, p14, p22, p30, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+    IDFT4_NODE_LAZY_2_2(p7, p15, p23, p31, w2, w2pre, w, wpre, Iw, Iwpre, n, n2); \
+} while(0)
+
+#endif /* N_FFT_MACROS_H */
diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c
new file mode 100644
index 0000000000..b37a804c85
--- /dev/null
+++ b/src/n_fft/profile/p-dft.c
@@ -0,0 +1,204 @@
+#include "nmod_poly.h"
+#include "profiler.h"
+#include "nmod_vec.h"
+#include "fft_small.h"
+#include "n_fft.h"
+
+#define NUM_PRIMES 7
+
+/* parameters handed to every sample_* callback through prof_repeat */
+typedef struct
+{
+    ulong prime;
+    ulong depth;
+} info_t;
+
+/* SAMPLE(fun, _variant) defines sample_<fun><_variant>: for each of `count`
+ * rounds it times `rep` in-place calls of n_fft_<fun><_variant> at info->depth */
+#define SAMPLE(fun, _variant) \
+void sample_##fun##_variant(void * arg, ulong count) \
+{ \
+    info_t * info = (info_t *) arg; \
+    const ulong p = info->prime; \
+    const ulong depth = info->depth; \
+    \
+    const ulong len = (UWORD(1) << depth); \
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); \
+    \
+    /* modulus, roots of unity */ \
+    n_fft_ctx_t F; \
+    n_fft_ctx_init2(F, depth, p); \
+    \
+    FLINT_TEST_INIT(state); \
+    \
+    ulong * coeffs = _nmod_vec_init(len); \
+    for (ulong k = 0; k < len; k++) \
+        coeffs[k] = n_randint(state, p); \
+    \
+    for (ulong i = 0; i < count; i++) \
+    { \
+        prof_start(); \
+        for (ulong j = 0; j < rep; j++) \
+            n_fft_##fun##_variant(coeffs, depth, F); \
+        prof_stop(); \
+    } \
+    \
+    _nmod_vec_clear(coeffs); \
+    n_fft_ctx_clear(F); \
+    FLINT_TEST_CLEAR(state); \
+} \
+
+SAMPLE(dft, )
+SAMPLE(idft, )
+SAMPLE(dft_t, )
+SAMPLE(idft_t, )
+
+/* same timing loop for sd_fft_trunc (fft_small), run on a vector of doubles */
+void sample_sd_fft(void * arg, ulong count)
+{
+    info_t * info = (info_t *) arg;
+    const ulong p = info->prime;
+    const ulong depth = info->depth;
+
+    const ulong len = UWORD(1) << depth;
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+    sd_fft_ctx_t Q;
+    sd_fft_ctx_init_prime(Q, p);
+    sd_fft_ctx_fit_depth(Q, depth);
+
+    ulong sz = sd_fft_ctx_data_size(depth)*sizeof(double);
+
+    FLINT_TEST_INIT(state);
+
+    nmod_t mod;
+    nmod_init(&mod, p);
+    ulong * coeffs = _nmod_vec_init(len);
+    _nmod_vec_randtest(coeffs, state, len, mod);
+
+    double* data = flint_aligned_alloc(4096, n_round_up(sz, 4096));
+    for (ulong i = 0; i < len; i++)
+        data[i] = coeffs[i];
+
+    for (ulong i = 0; i < count; i++)
+    {
+        prof_start();
+        for (ulong j = 0; j < rep; j++)
+            sd_fft_trunc(Q, data, depth, len, len);
+        prof_stop();
+    }
+
+    /* release both buffers: prof_repeat calls this sampler many times */
+    flint_aligned_free(data);
+    _nmod_vec_clear(coeffs);
+    sd_fft_ctx_clear(Q);
+    FLINT_TEST_CLEAR(state);
+}
+
+int main(void)
+{
+    flint_printf("- depth is log(fft length)\n");
+    flint_printf("- timing DFT (length power of 2) for several bit lengths and depths\n");
+    flint_printf("depth\tsd_fft\tdft\tidft\tdft_t\tidft_t\n");
+
+    ulong primes[NUM_PRIMES] = {
+        786433,              // 20 bits, 1 + 2**18 * 3
+        1073479681,          // 30 bits, 1 + 2**30 - 2**18 == 1 + 2**18 * (2**12 - 1)
+        2013265921,          // 31 bits, 1 + 2**27 * 3 * 5
+        2748779069441,       // 42 bits, 1 + 2**39 * 5
+        1108307720798209,    // 50 bits, 1 + 2**44 * 3**2 * 7
+        1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23
+        4611686018427322369  // 62 bits: 1 + 2**62 - 2**16 == 1 + 2**16 * (2**46 - 1)
+    };
+    ulong max_depths[NUM_PRIMES] = { 18, 18, 25, 25, 25, 25, 16 };
+
+    for (ulong k = 4; k < 6; k++)
+    {
+        for (ulong depth = 3; depth <= max_depths[k]; depth++)
+        {
+            flint_printf("%wu\t", depth);
+
+            info_t info;
+            info.prime = primes[k];
+            info.depth = depth;
+
+            const ulong len = UWORD(1) << depth;
+            const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+            // zero-initialised: min[0] is printed even when sd_fft is skipped below
+            double min[5] = {0};
+            double max;
+
+            if (k < 5) prof_repeat(min+0, &max, sample_sd_fft, (void *) &info);
+            prof_repeat(min+1, &max, sample_dft, (void *) &info);
+            prof_repeat(min+2, &max, sample_idft, (void *) &info);
+            prof_repeat(min+3, &max, sample_dft_t, (void *) &info);
+            prof_repeat(min+4, &max, sample_idft_t, (void *) &info);
+
+            flint_printf("%.1e\t%.1e\t%.1e\t%.1e\t%.1e\n",
+                    min[0]/(double)1000000/rep,
+                    min[1]/(double)1000000/rep,
+                    min[2]/(double)1000000/rep,
+                    min[3]/(double)1000000/rep,
+                    min[4]/(double)1000000/rep
+                    );
+        }
+    }
+    return 0;
+}
+
+/** 50 bit prime, commit "introduce_nmod_fft f1852d1c5"
+ *
+ * Output on zen4 (AMD Ryzen 7 PRO 7840U)
+ *
+ * depth   sd_fft  dft     idft    dft_t   idft_t
+ * 3       1.5e-08 2.2e-08 2.0e-08 2.3e-08 1.8e-08
+ * 4       2.1e-08 4.4e-08 4.5e-08 4.3e-08 4.7e-08
+ * 5       2.7e-08 9.3e-08 1.1e-07 9.5e-08 1.1e-07
+ * 6       6.2e-08 2.2e-07 2.3e-07 2.0e-07 2.6e-07
+ * 7       1.2e-07 5.0e-07 5.9e-07 5.1e-07 5.6e-07
+ * 8       2.9e-07 1.2e-06 1.2e-06 1.1e-06 1.3e-06
+ * 9       5.7e-07 2.6e-06 2.8e-06 2.7e-06 2.8e-06
+ * 10      1.3e-06 5.7e-06 5.6e-06 5.2e-06 6.1e-06
+ * 11      2.9e-06 1.2e-05 1.3e-05 1.2e-05 1.3e-05
+ * 12      6.0e-06 2.7e-05 2.6e-05 2.5e-05 2.8e-05
+ * 13      1.3e-05 5.6e-05 6.0e-05 5.7e-05 6.0e-05
+ * 14      2.9e-05 1.2e-04 1.2e-04 1.1e-04 1.3e-04
+ * 15
5.9e-05 2.6e-04 2.7e-04 2.6e-04 2.7e-04 + * 16 1.2e-04 5.6e-04 5.6e-04 5.1e-04 5.8e-04 + * 17 2.7e-04 1.2e-03 1.2e-03 1.2e-03 1.2e-03 + * 18 5.8e-04 2.5e-03 2.4e-03 2.3e-03 2.6e-03 + * 19 1.2e-03 5.2e-03 5.4e-03 5.1e-03 5.4e-03 + * 20 2.6e-03 1.1e-02 1.1e-02 1.0e-02 1.2e-02 + * 21 6.0e-03 2.3e-02 2.3e-02 2.3e-02 2.4e-02 + * 22 1.3e-02 5.0e-02 4.9e-02 4.6e-02 5.1e-02 + * 23 2.8e-02 1.0e-01 1.1e-01 1.0e-01 1.1e-01 + * 24 6.2e-02 2.2e-01 2.3e-01 2.0e-01 2.3e-01 + * 25 1.3e-01 4.5e-01 4.5e-01 4.4e-01 4.7e-01 + * + * Output on meteorlake (Intel(R) Core(TM) Ultra 7 165H) + * + * depth sd_fft dft idft dft_t idft_t + * 3 1.9e-08 2.1e-08 1.6e-08 2.4e-08 1.3e-08 + * 4 2.2e-08 4.6e-08 3.6e-08 4.5e-08 3.7e-08 + * 5 3.0e-08 9.5e-08 9.8e-08 1.0e-07 9.0e-08 + * 6 6.4e-08 2.3e-07 2.0e-07 2.0e-07 2.4e-07 + * 7 1.3e-07 5.3e-07 5.0e-07 5.2e-07 5.3e-07 + * 8 2.8e-07 1.2e-06 9.5e-07 9.8e-07 1.2e-06 + * 9 6.4e-07 2.6e-06 2.3e-06 2.4e-06 2.6e-06 + * 10 1.4e-06 5.7e-06 4.5e-06 4.6e-06 5.6e-06 + * 11 3.0e-06 1.3e-05 1.1e-05 1.1e-05 1.3e-05 + * 12 6.4e-06 2.7e-05 2.0e-05 2.1e-05 2.7e-05 + * 13 1.4e-05 5.8e-05 4.8e-05 4.9e-05 5.8e-05 + * 14 3.0e-05 1.2e-04 9.2e-05 9.6e-05 1.2e-04 + * 15 6.3e-05 2.6e-04 2.1e-04 2.2e-04 2.5e-04 + * 16 1.3e-04 5.4e-04 4.1e-04 4.2e-04 5.5e-04 + * 17 2.8e-04 1.1e-03 9.4e-04 9.6e-04 1.1e-03 + * 18 6.3e-04 2.4e-03 1.9e-03 2.0e-03 2.5e-03 + * 19 1.3e-03 5.2e-03 4.3e-03 4.4e-03 5.1e-03 + * 20 2.9e-03 1.1e-02 8.7e-03 8.9e-03 1.1e-02 + * 21 6.4e-03 2.4e-02 2.1e-02 2.0e-02 2.4e-02 + * 22 1.5e-02 5.3e-02 4.0e-02 4.1e-02 5.2e-02 + * 23 3.0e-02 1.1e-01 9.2e-02 9.1e-02 1.1e-01 + * 24 6.3e-02 2.3e-01 1.9e-01 1.8e-01 2.3e-01 + * 25 1.4e-01 4.7e-01 4.1e-01 4.1e-01 4.7e-01 + */ diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c new file mode 100644 index 0000000000..f19117066a --- /dev/null +++ b/src/n_fft/profile/p-init.c @@ -0,0 +1,126 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. 
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "nmod.h"
+#include "profiler.h"
+#include "n_fft.h"
+
+#define num_primes 5
+
+typedef struct
+{
+    ulong prime;
+    ulong depth;
+    ulong maxdepth;
+} info_t;
+// times `rep` rounds of n_fft_ctx_init2_root + n_fft_ctx_clear at info->depth
+void sample_init2_root(void * arg, ulong count)
+{
+    info_t * info = (info_t *) arg;
+    ulong p = info->prime;
+    ulong depth = info->depth;
+    ulong maxdepth = info->maxdepth;
+
+    const ulong len = UWORD(1) << depth;
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+    // modulus, roots of unity (UWORD(1): the shifted constant must be as wide as ulong)
+    nmod_t mod;
+    nmod_init(&mod, p);
+    ulong cofactor = (p - 1) >> maxdepth;
+    ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), cofactor, mod);
+    ulong w = nmod_pow_ui(w0, UWORD(1) << (maxdepth - depth), mod);
+
+    FLINT_TEST_INIT(state);
+
+    for (ulong i = 0; i < count; i++)
+    {
+        prof_start();
+        for (ulong j = 0; j < rep; j++)
+        {
+            n_fft_ctx_t F;
+            n_fft_ctx_init2_root(F, w, depth, cofactor, depth, p);
+            n_fft_ctx_clear(F);
+        }
+        prof_stop();
+    }
+
+    FLINT_TEST_CLEAR(state);
+}
+
+/*-----------------------------------------------------------------*/
+/* initialize context for FFT for several bit lengths and depths   */
+/*-----------------------------------------------------------------*/
+void time_fft_init(ulong * primes, ulong * max_depths)
+{
+    for (ulong depth = 3; depth <= 25; depth++)
+    {
+        flint_printf("%wu\t", depth);
+        for (ulong k = 0; k < num_primes; k++)
+        {
+            if (depth <= max_depths[k])
+            {
+                info_t info;
+                info.prime = primes[k];
+                info.maxdepth = max_depths[k];
+                info.depth = depth;
+
+                const ulong len = UWORD(1) << depth;
+                const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+                double min;
+                double max;
+
+                prof_repeat(&min, &max, sample_init2_root, (void *) &info);
+
+                flint_printf("%.1e|%.1e\t",
+                        min/(double)1000000/rep,
+                        min/(double)FLINT_CLOCK_SCALE_FACTOR/len/rep
+                        );
+            }
+            else
+                flint_printf(" na | na \t");
+        }
+        flint_printf("\n");
+    }
+
+}
+
+/*------------------------------------------------------------*/
+/* main just calls time_fft_init()                            */
+/*------------------------------------------------------------*/
+int main(void)
+{
+    printf("- depth == precomputing w**k, 0 <= k < 2**depth\n");
+    printf("- timing init FFT context + clear at this depth:\n");
+    printf(" t_raw == raw time\n");
+    printf(" t_unit == raw time divided by 2**depth * clock scale factor\n");
+    printf("\n");
+
+    printf(" \t 20 bits \t 31 bits \t 42 bits \t 50 bits \t 60 bits \n");
+    printf("depth\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\n");
+
+    // TODO fix for FLINT_BITS==32
+    ulong primes[num_primes] = {
+        786433,              // 20 bits, 1 + 2**18 * 3
+        2013265921,          // 31 bits, 1 + 2**27 * 3 * 5
+        2748779069441,       // 42 bits, 1 + 2**39 * 5
+        1108307720798209,    // 50 bits, 1 + 2**44 * 3**2 * 7
+        1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23
+    };
+    ulong max_depths[num_primes] = { 18, 25, 25, 25, 25 };
+
+    time_fft_init(primes, max_depths);
+
+    return 0;
+}
+
diff --git a/src/n_fft/test/main.c b/src/n_fft/test/main.c
new file mode 100644
index 0000000000..5c82383b68
--- /dev/null
+++ b/src/n_fft/test/main.c
@@ -0,0 +1,33 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+/* Include functions *********************************************************/
+
+#include "t-init.c"
+#include "t-dft.c"
+#include "t-idft.c"
+#include "t-dft_t.c"
+#include "t-idft_t.c"
+
+/* Array of test functions ***************************************************/
+
+test_struct tests[] =
+{
+    TEST_FUNCTION(n_fft_ctx_init2),
+    TEST_FUNCTION(n_fft_dft),
+    TEST_FUNCTION(n_fft_idft),
+    TEST_FUNCTION(n_fft_dft_t),
+    TEST_FUNCTION(n_fft_idft_t),
+};
+
+/* main function *************************************************************/
+
+TEST_MAIN(tests)
diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c
new file mode 100644
index 0000000000..e6808a5e80
--- /dev/null
+++ b/src/n_fft/test/t-dft.c
@@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 11
+
+TEST_FUNCTION_START(n_fft_dft, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p with max_depth >= MAX_EVAL_DEPTH
+        ulong max_depth, prime;
+
+        // odd iterations (half of them, whatever the test multiplier is) use a
+        // fixed large prime, close to limit:
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i % 2 != 0)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // retrieve roots, used later for multipoint evaluation
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k]; // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // choose random poly of degree < len
+            nmod_poly_t pol;
+            nmod_poly_init(pol, mod.n);
+            nmod_poly_randtest(pol, state, len);
+            // copy it for DFT
+            nn_ptr p = _nmod_vec_init(len);
+            _nmod_vec_set(p, pol->coeffs, len);
+
+            // evals via general multipoint evaluation
+            nn_ptr evals_br = _nmod_vec_init(len);
+            nmod_poly_evaluate_nmod_vec(evals_br, pol, roots, len);
+
+            // evals by DFT
+            n_fft_dft(p, depth, F);
+
+            int res = _nmod_vec_equal(evals_br, p, len);
+
+            if (!res)
+            {
+                TEST_FUNCTION_FAIL(
+                        "prime = %wu\n"
+                        "root of unity = %wu\n"
+                        "max_depth = %wu\n"
+                        "depth = %wu\n"
+                        "failed equality test\n",
+                        prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            nmod_poly_clear(pol);
+            _nmod_vec_clear(evals_br);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-dft_t.c b/src/n_fft/test/t-dft_t.c
new file mode 100644
index 0000000000..aa0e1d676e
--- /dev/null
+++ b/src/n_fft/test/t-dft_t.c
@@ -0,0 +1,130 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 9
+
+/** computes the weighted power sums
+ *       q == [PowerSum(p, w**j) for 0 <= j < len]
+ * where PowerSum(p, w**j) == sum(p[i] * w[i]**j for 0 <= i < len)
+ * and where roots == [w[i] for 0 <= i < len]
+ */
+static void t_dft_t_weighted_power_sums(nn_ptr q, nn_srcptr p, nn_srcptr roots, ulong len, nmod_t mod)
+{
+    // initially w**0 == [1,..,1]:
+    nn_ptr w_pow_j = _nmod_vec_init(len);
+    for (ulong i = 0; i < len; i++)
+        w_pow_j[i] = 1;
+
+    for (ulong j = 0; j < len; j++)
+    {
+        // at this stage, w_pow_j holds [w[i]**j for 0 <= i < len]
+        q[j] = 0;
+        for (ulong i = 0; i < len; i++)
+        {
+            q[j] = nmod_add(q[j],
+                            nmod_mul(p[i], w_pow_j[i], mod),
+                            mod);
+            w_pow_j[i] = nmod_mul(w_pow_j[i], roots[i], mod);
+        }
+    }
+    _nmod_vec_clear(w_pow_j);
+}
+
+TEST_FUNCTION_START(n_fft_dft_t, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p with max_depth >= MAX_EVAL_DEPTH
+        ulong max_depth, prime;
+
+        // odd iterations use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i % 2 != 0)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // retrieve roots
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k]; // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // construct random array of length len
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+            // q receives the reference result; every entry is overwritten by
+            // the naive computation below, so p is not copied into it
+            nn_ptr q = _nmod_vec_init(len);
+
+            // naive weighted power sums
+            t_dft_t_weighted_power_sums(q, p, roots, len, mod);
+
+            // transposed DFT
+            n_fft_dft_t(p, depth, F);
+
+            int res = _nmod_vec_equal(p, q, len);
+
+            if (!res)
+                TEST_FUNCTION_FAIL(
+                        "prime = %wu\n"
+                        "root of unity = %wu\n"
+                        "max_depth = %wu\n"
+                        "depth = %wu\n"
+                        "failed equality test\n",
+                        prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+
+            _nmod_vec_clear(p);
+            _nmod_vec_clear(q);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-idft.c b/src/n_fft/test/t-idft.c
new file mode 100644
index 0000000000..b1085e7590
--- /dev/null
+++ b/src/n_fft/test/t-idft.c
@@ -0,0 +1,107 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 10
+
+TEST_FUNCTION_START(n_fft_idft, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p with max_depth >= MAX_EVAL_DEPTH
+        ulong max_depth, prime;
+
+        // odd iterations use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i % 2 != 0)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // retrieve roots, used later for general interpolation
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k]; // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // choose len random evaluations (allocated to match _nmod_vec_clear below)
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+
+            // general interpolation
+            nmod_poly_t pol;
+            nmod_poly_init(pol, prime);
+            nmod_poly_interpolate_nmod_vec(pol, roots, p, len);
+
+            // interpolate via IDFT
+            n_fft_idft(p, depth, F);
+
+            int res = _nmod_vec_equal(pol->coeffs, p, len);
+
+            if (!res)
+            {
+                _nmod_vec_print(p, len, mod);
+                _nmod_vec_print(pol->coeffs, len, mod);
+                TEST_FUNCTION_FAIL(
+                        "prime = %wu\n"
+                        "root of unity = %wu\n"
+                        "max_depth = %wu\n"
+                        "depth = %wu\n"
+                        "failed equality test\n",
+                        prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            nmod_poly_clear(pol);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-idft_t.c b/src/n_fft/test/t-idft_t.c
new file mode 100644
index 0000000000..b4a0cb1bf2
--- /dev/null
+++ b/src/n_fft/test/t-idft_t.c
@@ -0,0 +1,96 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 13
+
+TEST_FUNCTION_START(n_fft_idft_t, state)
+{
+    int i;
+
+    for (i = 0; i < 1000 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p with max_depth >= MAX_EVAL_DEPTH
+        ulong max_depth, prime;
+
+        // odd iterations use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i % 2 != 0)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // construct random array of length len
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+            // copy it before in-place transform
+            nn_ptr q = _nmod_vec_init(len);
+            _nmod_vec_set(q, p, len);
+
+            // apply idft_t
+            n_fft_idft_t(p, depth, F);
+            // apply dft_t
+            n_fft_dft_t(p, depth, F);
+
+            // check dft_t o idft_t == identity
+            int res = _nmod_vec_equal(p, q, len);
+
+            if (!res)
+            {
+                TEST_FUNCTION_FAIL(
+                        "prime = %wu\n"
+                        "root of unity = %wu\n"
+                        "max_depth = %wu\n"
+                        "depth = %wu\n"
+                        "failed equality test\n",
+                        prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            _nmod_vec_clear(q);
+        }
+
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c
new file mode 100644
index 0000000000..30449469c6
--- /dev/null
+++ b/src/n_fft/test/t-init.c
@@ -0,0 +1,163 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "n_fft.h"
+
+// return bit reversal index of k for given nbits:
+// e.g.
br_index([0,1,2,3], 4) == [0, 8, 4, 12]
+static inline ulong br_index(ulong k, ulong nbits)
+{
+    // Move the bits of k into r one at a time, lowest bit of k first; this
+    // is independent of FLINT_BITS and of the size of k, and it never
+    // shifts by a full word (nbits == 0 simply yields 0).
+    ulong r = 0;
+    for (ulong i = 0; i < nbits; i++)
+    {
+        r = (r << 1) | (k & 1);
+        k >>= 1;
+    }
+    return r;
+}
+
+int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t state)
+{
+    // if depth < 3, init is supposed to behave as if depth == 3
+    depth = FLINT_MAX(3, depth);
+
+    // check all basic attributes
+    if (F->mod != p)
+        return 1;
+
+    if (F->max_depth != max_depth)
+        return 2;
+
+    if ((1 + (F->cofactor << max_depth)) != p)
+        return 3;
+
+    if (F->depth != depth)
+        return 4;
+
+    // retrieve primitive root and its inverse
+    const ulong w = F->tab_w2[2*(max_depth-2)];
+    const ulong iw = n_invmod(w, p);
+
+    // check the primitive root (NOTE(review): from here to the w2 load, text was lost and is reconstructed; confirm upstream)
+    if (n_powmod2(w, UWORD(1)<<max_depth, p) != 1)
+        return 5;
+
+    // check all entries of tab_w2
+    for (ulong k = 0; k < max_depth-1; k++)
+    {
+        ulong w2 = F->tab_w2[2*k];
+        if (w2 != n_powmod2(w, UWORD(1)<<(max_depth-2-k), p))
+            return 6;
+        if (F->tab_w2[2*k+1] != n_mulmod_precomp_shoup(w2, p))
+            return 7;
+    }
+
+    // check all entries of tab_inv2
+    for (ulong k = 0; k < max_depth; k++)
+    {
+        ulong inv2 = F->tab_inv2[2*k];
+        if (inv2 != n_invmod((UWORD(1)<<(k+1)), p))
+            return 8;
+        if (F->tab_inv2[2*k+1] != n_mulmod_precomp_shoup(inv2, p))
+            return 9;
+    }
+
+    // check a few random entries of tab_w and tab_iw
+    for (ulong j = 0; j < 1000; j++)
+    {
+        ulong k = n_randint(state, UWORD(1) << (F->depth - 1));
+        ulong exp = br_index(k, F->max_depth - 1);
+
+        ulong wk = F->tab_w[2*k];
+        if (wk != n_powmod2(w, exp, p))
+            return 10;
+        if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p))
+            return 11;
+
+        ulong iwk = F->tab_iw[2*k];
+        if (iwk != n_powmod2(iw, exp, p))
+            return 12;
+        if (F->tab_iw[2*k+1] != n_mulmod_precomp_shoup(iwk, p))
+            return 13;
+    }
+
+    return 0;
+}
+
+TEST_FUNCTION_START(n_fft_ctx_init2, state)
+{
+    int i;
+
+    for (i = 0; i < 1000 * flint_test_multiplier(); i++)
+    {
+        ulong p, max_depth;
+        if (i % 20 != 0)
+        {
+            // take random prime in [17, 2**(FLINT_BITS-2))
+#if FLINT_BITS == 64
+            ulong bits = 5 + n_randint(state, 58);
+#else
+            ulong bits = 5 + n_randint(state, 25);
+#endif
+            p = n_randprime(state, bits, 1);
+            max_depth = flint_ctz(p-1);
+
+            // we need p such that 8 divides p-1
+            while (max_depth < 3)
+            {
+                p = n_randprime(state, bits, 1);
+                max_depth = flint_ctz(p-1);
+            }
+        }
+        else
+        {
+            // the above will most often have max_depth 3 or 4
+            // every now and then we want p with larger max_depth
+#if FLINT_BITS == 64
+            max_depth = 40 + n_randint(state, 10);
+#else
+            max_depth = 10 + n_randint(state, 10);
+#endif
+            p = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(p))
+                p += (UWORD(1) << max_depth);
+            max_depth = flint_ctz(p-1);
+        }
+
+        // take depth in [0, min(12, max_depth)), upper bound excluded
+        ulong depth = n_randint(state, FLINT_MIN(12, max_depth));
+
+        // init
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, depth, p);
+
+        int res = test_one(F, max_depth, depth, p, state);
+
+        if (res)
+            TEST_FUNCTION_FAIL(
+                    "prime = %wu\n"
+                    "root of unity = %wu\n"
+                    "max_depth = %wu\n"
+                    "depth = %wu\n"
+                    "error code = %d\n",
+                    p, F->tab_w2[2*(max_depth-2)], max_depth, depth, res);
+
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
diff --git a/src/nmod_vec/profile/p-dot.c b/src/nmod_vec/profile/p-dot.c
index 6d226710be..217f715704 100644
--- a/src/nmod_vec/profile/p-dot.c
+++ b/src/nmod_vec/profile/p-dot.c
@@ -9,9 +9,9 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
*/ -#include #include // for atoi +#include "ulong_extras.h" #include "profiler.h" #include "nmod.h" #include "nmod_vec.h" diff --git a/src/ulong_extras/profile/p-powmod.c b/src/ulong_extras/profile/p-powmod.c new file mode 100644 index 0000000000..0a8e00c10e --- /dev/null +++ b/src/ulong_extras/profile/p-powmod.c @@ -0,0 +1,152 @@ +/* + Copyright 2024 (C) Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . + */ + +#include "profiler.h" +#include "ulong_extras.h" +#include "double_extras.h" + +#define NB_ITER 1000 + +typedef struct +{ + ulong bits; + ulong exp; +} info_t; + + +void sample_preinv(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**(FLINT_BITS) + ulong ninv = n_preinvert_limb(n); + ulong norm = flint_clz(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randint(state, n); // 0 <= array[j] < n + + prof_start(); + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_powmod_ui_preinv(array[j], exp, n, ninv, norm); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +void sample_preinv2(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**(FLINT_BITS) + ulong ninv = n_preinvert_limb(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randlimb(state); + + prof_start(); + for (ulong j = 0; 
j < NB_ITER; j++) + array[j] = n_powmod2_ui_preinv(array[j], exp, n, ninv); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +void sample_precomp(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**bits + double ninv = n_precompute_inverse(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randint(state, n); // 0 <= array[j] < n + + prof_start(); + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_powmod_ui_precomp(array[j], exp, n, ninv); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +int main(void) +{ + double min, max; + + const ulong bits_nb = 5; + ulong bits_list[] = {20, 30, 50, 60, 64}; + const ulong exp_nb = 11; + ulong exp_list[] = {5, 10, 20, 40, 80, 160, 1000, 10000, 100000, 1000000L, 10000000L}; + + flint_printf("compute an exponentiation a**e mod n, with nbits(n) = b\n"); + flint_printf(" computation is repeated on the element of a %wu-length array\n"); + flint_printf(" time is divided by %wu * FLINT_CLOCK_SCALE_FACTOR * log_2(exp)\n", NB_ITER, NB_ITER); + flint_printf("timings are: powmod_ui_precomp | powmod_ui_preinv | powmod2_ui_preinv\n"); + flint_printf("b \\ e\t"); + for (ulong e = 0; e < exp_nb; e++) + flint_printf("%wu\t\t", exp_list[e]); + flint_printf("\n"); + + info_t info; + + for (ulong b = 0; b < bits_nb; b++) + { + info.bits = bits_list[b]; + flint_printf("%wu\t", info.bits); + + for (ulong e = 0; e < exp_nb; e++) + { + info.exp = exp_list[e]; + double log_exp = d_log2((double)info.exp); + + if (info.bits <= 53) + { + prof_repeat(&min, &max, sample_precomp, (void *) &info); + flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + } + else + flint_printf(" na |"); + + prof_repeat(&min, &max, sample_preinv, (void *) 
&info); + flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + + prof_repeat(&min, &max, sample_preinv2, (void *) &info); + flint_printf("%4.1f\t", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + } + flint_printf("\n"); + } + + return 0; +}