diff --git a/Makefile.in b/Makefile.in
index 0f25aa2b56..7ac241a637 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -193,7 +193,8 @@ HEADER_DIRS :=                                                              \
         fmpz_mod_mpoly_factor           fmpq_mpoly_factor                   \
         fq_nmod_mpoly_factor            fq_zech_mpoly_factor                \
                                                                             \
-        fft             @FFT_SMALL@     fmpz_poly_q     fmpz_lll            \
+        fft             n_fft		@FFT_SMALL@                         \
+	fmpz_poly_q     fmpz_lll            				    \
         n_poly          arith           qsieve          aprcl               \
                                                                             \
         nf              nf_elem         qfb                                 \
diff --git a/src/n_fft.h b/src/n_fft.h
new file mode 100644
index 0000000000..0df2674ff1
--- /dev/null
+++ b/src/n_fft.h
@@ -0,0 +1,234 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef N_FFT_H
+#define N_FFT_H
+
+#include "flint.h"
+
+#define N_FFT_CTX_DEFAULT_DEPTH 12
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * TODO[long term] large depth can lead to heavy memory usage
+ *              --> provide precomputation-free functions
+ *
+ * TODO[long term] avx512 vectorization
+ *
+ * TODO[long term] on zen4 (likely on other cpus as well) ctx_init becomes
+ * slower at some point, losing a factor 4 or more; this is expected due to
+ * memory aspects but arises rather early, in fact the depth where it becomes
+ * slower is significantly smaller (~13-14) when tab_iw has been incorporated
+ * compared to without tab_iw (it was depth ~20-21); see if this can be
+ * understood, and maybe play with vectorization for those simple functions
+ */
+
+
+/*-------------------------------------------------*/
+/* STRUCTURES FOR FFT CONTEXT / FUNCTION ARGUMENTS */
+/*-------------------------------------------------*/
+
+
+/** n_fft context:
+ *     - basic parameters
+ *     - precomputed powers of the primitive root of unity and its inverse
+ *     - precomputed inverses of 2**k
+ *
+ * Requirements (not checked upon init):
+ *     - mod is an odd prime < 2**(FLINT_BITS-2)
+ *     - max_depth must be >= 3 (so, 8 must divide mod - 1)
+ * Total memory cost of precomputations for arrays tab_{w,iw,w2,inv2}:
+ *     at most 2 * (2*FLINT_BITS + 2**depth) ulong's
+ *     (tab_w and tab_iw hold 2**depth ulong's each, tab_w2 and tab_inv2 hold
+ *     2*FLINT_BITS ulong's each)
+ *
+ * Ownership: tab_w and tab_iw are heap arrays allocated by the init
+ * functions, grown by n_fft_ctx_fit_depth, and released by n_fft_ctx_clear;
+ * tab_w2 and tab_inv2 are stored inside the structure itself.
+ *
+ * For more details about the content of tab_{w,iw,w2,inv2}, see comments below
+ **/
+typedef struct
+{
+    ulong mod;                    // modulus, odd prime
+    ulong max_depth;              // maximum supported depth (w has order 2**max_depth)
+    ulong cofactor;               // prime is 1 + cofactor * 2**max_depth
+    ulong depth;                  // depth supported by current precomputation
+    nn_ptr tab_w;                 // precomputed powers of w (length 2**depth)
+    nn_ptr tab_iw;                // precomputed powers of 1/w (length 2**depth)
+    ulong tab_w2[2*FLINT_BITS];   // precomputed powers w**(2**k)
+    ulong tab_inv2[2*FLINT_BITS]; // precomputed inverses of 2**k
+} n_fft_ctx_struct;
+typedef n_fft_ctx_struct n_fft_ctx_t[1];
+
+
+/** n_fft arguments:
+ *     - modulus mod
+ *     - its double 2*mod (storing helps for speed)
+ *     - precomputed powers of w
+ * To be used as an argument in FFT functions. In some parts, providing this
+ * instead of the whole context increased performance. Also, this facilitates
+ * using the same function with both tab_w and tab_iw (by forming an fft_args
+ * with Fargs->tab_w = F->tab_iw).
+ **/
+typedef struct
+{
+    ulong mod;                 // modulus, odd prime
+    ulong mod2;                // 2*mod
+    nn_srcptr tab_w;           // tabulated powers of w (pointer only, not a copy), see below
+} n_fft_args_struct;
+typedef n_fft_args_struct n_fft_args_t[1];
+
+
+/** tab_w2:
+ *    - length 2*FLINT_BITS, with undefined entries at index 2*(max_depth-1) and beyond
+ *    - contains powers w**d for d a power of 2, and corresponding
+ *    precomputations for modular multiplication:
+ *       -- for 0 <= k < max_depth-1, tab_w2[2*k] = w**(2**(max_depth-2-k))
+ *          and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod)
+ *       -- for 2*(max_depth-1) <= k < 2*FLINT_BITS, tab_w2[k] is undefined
+ *
+ * --> one can retrieve w as tab_w2[2 * (max_depth-2)]
+ * --> the first elements are tab_w2 = [I, I_pr, J, J_pr, ...]
+ * where I is a square root of -1 and J is a square root of I
+ */
+
+/** tab_w:
+ *     - length 2**depth
+ *     - contains 2**(depth-1) first powers of w in (max_depth-1)-bit reversed order,
+ *     and corresponding precomputations for modular multiplication:
+ *        -- for 0 <= k < 2**(depth-1), tab_w[2*k] = w**(br[k])
+ *           and tab_w[2*k+1] = floor(tab_w[2*k] * 2**FLINT_BITS / mod)
+ *  where br = [0, 2**(max_depth-2), 2**(max_depth-3), 3 * 2**(max_depth-3), ...]
+ *  is the bit reversal permutation of length 2**(max_depth-1)
+ *  (https://en.wikipedia.org/wiki/Bit-reversal_permutation)
+ *
+ * In particular the first elements are
+ *      tab_w = [1, 1_pr, I, I_pr, J, J_pr, IJ, IJ_pr, ...]
+ * where I is a square root of -1, J is a square root of I, and IJ = I*J. Note
+ * that powers of w beyond 2**(max_depth-1), for example -1, -I, -J, etc. are
+ * not stored.
+ **/
+
+/** tab_iw: same as tab_w but for the primitive root 1/w */
+
+/** tab_inv2:
+ *     - length 2*FLINT_BITS, with undefined entries at index 2*max_depth and beyond
+ *     - contains the modular inverses of 2**k, and corresponding
+ *    precomputations for modular multiplication:
+ *       -- for 0 <= k < max_depth, tab_inv2[2*k] = the inverse of 2**(k+1)
+ *       modulo mod, and tab_inv2[2*k+1] = floor(tab_inv2[2*k] * 2**FLINT_BITS / mod)
+ *       -- for 2*max_depth <= k < 2*FLINT_BITS, tab_inv2[k] is undefined
+ * 
+ * Recall F->mod == 1 + cofactor * 2**max_depth, so
+ *          1 == F->mod - cofactor * 2**(max_depth - k) * 2**k
+ * --> the inverse of 2**k in (0, F->mod) is
+ *          F->mod - cofactor * 2**(max_depth - k),
+ * we do not really need to store it, but we want the precomputations as well
+ */
+
+
+/*------------------------------------------*/
+/* PRECOMPUTATIONS / CONTEXT INITIALIZATION */
+/*------------------------------------------*/
+
+/** Note for init functions, when depth is provided:
+ *   - if it is < 3, it is pretended that it is 3
+ *   - if it is more than F->max_depth (the maximum possible with the given
+ *   prime), it is reduced to F->max_depth
+ * After calling init, precomputations support DFTs of length up to 2**depth
+ */
+
+/* initialize with given root and given depth:
+ *   - w is the root of unity of order 2**max_depth to be used
+ *   - max_depth and cofactor are such that mod == 1 + cofactor * 2**max_depth
+ *   - depth is the requested precomputation depth
+ *   - mod is the modulus (called p in the other init functions)
+ */
+void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong mod);
+
+/* find primitive root, initialize with given depth;
+ * max_depth and cofactor are deduced from the prime p */
+void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p);
+
+/* initialize with given root, using the default depth N_FFT_CTX_DEFAULT_DEPTH */
+FLINT_FORCE_INLINE
+void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong p)
+{
+    n_fft_ctx_init2_root(F, w, max_depth, cofactor, N_FFT_CTX_DEFAULT_DEPTH, p);
+}
+
+/* find primitive root, initialize with the default depth N_FFT_CTX_DEFAULT_DEPTH */
+FLINT_FORCE_INLINE
+void n_fft_ctx_init(n_fft_ctx_t F, ulong p)
+{
+    n_fft_ctx_init2(F, N_FFT_CTX_DEFAULT_DEPTH, p);
+}
+
+/* grows F->depth and precomputations to support DFTs of depth up to depth;
+ * the requested depth is capped at F->max_depth, and nothing is done if it
+ * does not exceed the current F->depth */
+void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth);
+
+/* releases the arrays F->tab_w and F->tab_iw */
+void n_fft_ctx_clear(n_fft_ctx_t F);
+
+/* fill the argument structure F with the modulus mod, its double 2*mod, and
+ * the table tab_w (only the pointer is stored, the table is not copied) */
+FLINT_FORCE_INLINE
+void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w)
+{
+    F->tab_w = tab_w;
+    F->mod2 = mod + mod;
+    F->mod = mod;
+}
+
+/*-----------------------------*/
+/* DFT / IDFT / DFT_t / IDFT_t */
+/*-----------------------------*/
+
+/** forward and inverse transforms, and their transposes:
+ *    - length is a power of 2, len == 2**depth
+ *    - requirement of all functions (not checked): depth <= F.depth
+ *    - the comments below describe algorithms that modify the input array p in
+ *    place: in these comments p stands for the input p, whereas q stands
+ *    for the array p after running the algorithm
+ *    - below in comments we write w[k] for 0 <= k < len/2, defined as
+ *            w[2*k]   == F->tab_w[2*k]
+ *            w[2*k+1] == - F->tab_w[2*k]
+ *    - hence the list w[k] for 0 <= k < len gives the len roots of the
+ *    polynomial x**len - 1, which are all powers of the chosen len-th
+ *    primitive root of unity, with exponents listed in bit reversed order
+ *    - the matrix of DFT of length len is the len x len matrix
+ *             DFT_{w,len} = [ w[i]**j ]_{0 <= i, j < len}
+ */
+
+/** dft: discrete Fourier transform (q = DFT_{w,len} * p)
+ * In-place transform p = [p[j] for 0 <= j < len], seen as a polynomial p(x) of
+ * degree < len, into its evaluations
+ *     q == [p(w[i])  for 0 <= i < len],
+ * where p(w[i]) = sum(p[j] * w[i]**j for 0 <= j < len)
+ */
+
+/** idft: inverse discrete Fourier transform (q = DFT_{w,len}^{-1} * p)
+ * In-place transform p = [p[i] for 0 <= i < len] into the list of coefficients
+ * q = [q[j] for 0 <= j < len] of the unique polynomial q(x) of degree < len
+ * such that p[i] == q(w[i])  for 0 <= i < len
+ */
+
+/** dft_t: transposed discrete Fourier transform (q = p * DFT_{w,len})
+ * In-place transform p = [p[i] for 0 <= i < len] into the list of weighted
+ * power sums
+ *        q == [PowerSum(p, w**j) for 0 <= j < len]
+ * where PowerSum(p, w**j) == sum(p[i] * w[i]**j for 0 <= i < len)
+ */
+
+/** idft_t: transposed inverse discrete Fourier transform (q = p * DFT_{w,len}^{-1})
+ * In-place transform p = [p[j] for 0 <= j < len] into the coefficients q =
+ * [q[i] for 0 <= i < len] which appear in the partial fraction decomposition
+ *      p(x) = sum_{0 <= i < len} q[i] / (1 - w[i] * x) + O(x**len)
+ * where p(x) is the power series p(x) = sum_{0 <= j < len} p[j] x**j  + O(x**len)
+ */
+
+/* forward transform ("dft" above), in place on p of length 2**depth */
+void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F);
+
+/* inverse transform ("idft" above), in place on p of length 2**depth */
+void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F);
+
+/* transposed forward transform ("dft_t" above), in place on p of length 2**depth */
+void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F);
+
+/* transposed inverse transform ("idft_t" above), in place on p of length 2**depth */
+void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* N_FFT_H */
diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c
new file mode 100644
index 0000000000..faba87e3da
--- /dev/null
+++ b/src/n_fft/ctx_init.c
@@ -0,0 +1,175 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "n_fft.h"
+#include "ulong_extras.h"  /* for mulmod_shoup* functions */
+
+/** Given the precomputed quotient a_pr for modular multiplication by a mod n,
+ *          a_pr == floor(a * 2**FLINT_BITS / n)
+ * where we assume 0 < a < n and n does not divide a * 2**FLINT_BITS,
+ * this returns the quotient for mulmod by -a mod n,
+ *          floor( (n-a) * 2**FLINT_BITS / n)
+ *          == 2**FLINT_BITS - ceil(a * 2**FLINT_BITS / n)
+ *          == 2**FLINT_BITS - (a_pr + 1)
+ *          == UWORD_MAX - a_pr
+ * (the ceiling equals a_pr + 1 since a * 2**FLINT_BITS / n is not an
+ * integer, and UWORD_MAX stands for 2**FLINT_BITS - 1)
+ *
+ * Note: the requirement "n does not divide a * 2**FLINT_BITS" follows
+ * from the other requirement 0 < a < n as soon as n is odd; in n_fft.h
+ * we will only use this for odd primes
+ */
+FLINT_FORCE_INLINE ulong n_mulmod_precomp_shoup_negate(ulong a_pr)
+{
+    return UWORD_MAX - a_pr;
+}
+
+/** Initializes the context F for the prime p == 1 + cofactor * 2**max_depth,
+ * using the given root of unity w of order 2**max_depth, with tables tab_w and
+ * tab_iw supporting DFTs of length up to 2**depth.
+ *   - a depth below 3 is raised to 3, a depth above max_depth is lowered to
+ *     max_depth
+ *   - the requirements on p, w, max_depth, cofactor are not checked here; in
+ *     particular the code below writes the first 8 entries of tab_w and tab_iw
+ *     and therefore relies on max_depth >= 3
+ *   - tab_w and tab_iw are allocated with flint_malloc; they are released by
+ *     n_fft_ctx_clear
+ */
+void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong p)
+{
+    if (depth < 3)
+        depth = 3;
+    if (max_depth < depth)
+        depth = max_depth;
+
+    // fill basic attributes
+    F->mod = p;
+    F->max_depth = max_depth;
+    F->cofactor = cofactor;
+    F->depth = 3;  // to be able to call fit_depth below
+
+    // fill tab_w2, from w itself (index max_depth-2) down to I (index 0),
+    // each entry being the square of the previous one
+    ulong pr_quo, pr_rem, ww;
+    ww = w;
+    n_mulmod_precomp_shoup_quo_rem(&pr_quo, &pr_rem, ww, p);
+    F->tab_w2[2*(max_depth-2)] = ww;
+    F->tab_w2[2*(max_depth-2)+1] = pr_quo;
+    for (slong k = max_depth-3; k >= 0; k--)
+    {
+        // ww <- ww**2 and its precomputed quotient
+        n_mulmod_and_precomp_shoup(&ww, &pr_quo, ww, ww, pr_quo, pr_rem, pr_quo, p);
+        pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, p);
+        F->tab_w2[2*k] = ww;
+        F->tab_w2[2*k+1] = pr_quo;
+    }
+    // at this stage, pr_quo and pr_rem are for k == 0 i.e. for I == tab_w2[0]
+
+    // fill tab_inv2: entry k is the inverse of 2**(k+1), with its precomputation
+    for (ulong k = 0; k < max_depth; k++)
+    {
+        F->tab_inv2[2*k] = p - (cofactor << (max_depth - k-1));
+        F->tab_inv2[2*k+1] = n_mulmod_precomp_shoup(F->tab_inv2[2*k], p);
+    }
+
+    // allocate tab_w and tab_iw for the requested depth, and fill the part
+    // corresponding to depth 3 (the first 8 entries of each)
+    ulong len = UWORD(1) << (depth-1);  // len >= 4
+    F->tab_w = (nn_ptr) flint_malloc(2*len * sizeof(ulong));
+    F->tab_iw = (nn_ptr) flint_malloc(2*len * sizeof(ulong));
+
+    // w**0 == iw**0 == 1
+    F->tab_w[0] = UWORD(1);
+    F->tab_w[1] = n_mulmod_precomp_shoup(UWORD(1), p);
+    F->tab_iw[0] = UWORD(1);
+    F->tab_iw[1] = F->tab_w[1];
+
+    // w**(L/4) == I and iw**(L/4) == -I, L == 2**max_depth
+    F->tab_w[2] = F->tab_w2[0];
+    F->tab_w[3] = F->tab_w2[1];
+    F->tab_iw[2] = p - F->tab_w2[0];
+    F->tab_iw[3] = n_mulmod_precomp_shoup_negate(F->tab_w2[1]);
+
+    // w**(L/8) == J and w**(3L/8) == I*J
+    // (pr_quo and pr_rem are still the precomputations for I, see above)
+    F->tab_w[4] = F->tab_w2[2];
+    F->tab_w[5] = F->tab_w2[3];
+    n_mulmod_and_precomp_shoup(F->tab_w+6, F->tab_w+7, F->tab_w2[0], F->tab_w2[2], pr_quo, pr_rem, F->tab_w2[3], p);
+
+    // iw**(L/8) == -I*J and iw**(3L/8) == -J
+    F->tab_iw[4] = p - F->tab_w[6];
+    F->tab_iw[5] = n_mulmod_precomp_shoup_negate(F->tab_w[7]);
+    F->tab_iw[6] = p - F->tab_w[4];
+    F->tab_iw[7] = n_mulmod_precomp_shoup_negate(F->tab_w[5]);
+
+    // complete tab_w and tab_iw up to specified depth
+    n_fft_ctx_fit_depth(F, depth);
+}
+
+/** Initializes F for the prime p with precomputations for the given depth:
+ * writes p - 1 as cofactor * 2**max_depth, derives an element w from a
+ * primitive root modulo p raised to the power cofactor, and then defers to
+ * n_fft_ctx_init2_root.
+ * The requirements on p are only checked through FLINT_ASSERT.
+ */
+void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p)
+{
+    ulong two_adicity, odd_part, gen, root;
+
+    FLINT_ASSERT(p > 2 && flint_clz(p) >= 2);    // 2 < p < 2**(FLINT_BITS-2)
+    FLINT_ASSERT(flint_ctz(p - UWORD(1)) >= 3);  // p-1 divisible by 8
+
+    // p - 1 == odd_part * 2**two_adicity
+    two_adicity = flint_ctz(p - UWORD(1));
+    odd_part = (p - UWORD(1)) >> two_adicity;
+
+    // root of order 2**two_adicity, obtained from a primitive root modulo p
+    gen = n_primitive_root_prime(p);
+    root = n_powmod2(gen, odd_part, p);
+
+    // fill all attributes and tables
+    n_fft_ctx_init2_root(F, root, two_adicity, odd_part, depth, p);
+}
+
+/** Releases the two heap tables held by F (tab_w and tab_iw). */
+void n_fft_ctx_clear(n_fft_ctx_t F)
+{
+    flint_free(F->tab_iw);
+    flint_free(F->tab_w);
+}
+
+/** Grows the tables tab_w and tab_iw of F so that they support DFTs of depth
+ * up to `depth`. The requested depth is capped at F->max_depth; if it does not
+ * exceed the current F->depth, nothing is done. Existing entries are kept
+ * (the arrays are reallocated) and only the new entries are computed.
+ */
+void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth)
+{
+    if (F->max_depth < depth)
+        depth = F->max_depth;
+
+    if (depth > F->depth)
+    {
+        ulong len = UWORD(1) << (depth-1);  // len >= 8 (since depth >= 4)
+        F->tab_w = flint_realloc(F->tab_w, 2*len * sizeof(ulong));
+        F->tab_iw = flint_realloc(F->tab_iw, 2*len * sizeof(ulong));
+
+        // Writing T[j] for tab_w[2*j] and L = 2**max_depth, the table is
+        // extended by successive doublings:
+        //    T[4..7]  is w**(L/16) * T[0..3],
+        //    T[8..15] is w**(L/32) * T[0..7], etc.
+        // (the entries filled at init follow the same pattern: T[1] is
+        // w**(L/4) * T[0], and T[2..3] is w**(L/8) * T[0..1])
+        // recall tab_w2[2*k] == w**(L / 2**(k+2))
+        ulong d = F->depth - 1;
+        ulong llen = UWORD(1) << (F->depth-1);
+        ulong ww, pr_quo, pr_rem;
+        for ( ; llen < len; llen <<= 1, d += 1)
+        {
+            ww = F->tab_w2[2*d];
+            pr_quo = F->tab_w2[2*d+1];
+            pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod);
+            // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation
+            // (unrolled by 4: llen is a power of 2 and llen >= 4 since F->depth >= 3)
+            for (ulong k = 0; k < llen; k+=4)
+            {
+                n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+0), F->tab_w + 2*llen + 2*(k+0)+1,
+                                           ww, F->tab_w[2*(k+0)],
+                                           pr_quo, pr_rem, F->tab_w[2*(k+0)+1], F->mod);
+                n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+1), F->tab_w + 2*llen + 2*(k+1)+1,
+                                           ww, F->tab_w[2*(k+1)],
+                                           pr_quo, pr_rem, F->tab_w[2*(k+1)+1], F->mod);
+                n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+2), F->tab_w + 2*llen + 2*(k+2)+1,
+                                           ww, F->tab_w[2*(k+2)],
+                                           pr_quo, pr_rem, F->tab_w[2*(k+2)+1], F->mod);
+                n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+3), F->tab_w + 2*llen + 2*(k+3)+1,
+                                           ww, F->tab_w[2*(k+3)],
+                                           pr_quo, pr_rem, F->tab_w[2*(k+3)+1], F->mod);
+
+                // new part of tab_iw: the entry at position llen + (llen-1-k)
+                // is the negation of the tab_w entry at position llen + k
+                F->tab_iw[2*llen + 2*(llen-1-(k+0))] = F->mod - F->tab_w[2*llen + 2*(k+0)];
+                F->tab_iw[2*llen + 2*(llen-1-(k+0)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+0)+1]);
+                F->tab_iw[2*llen + 2*(llen-1-(k+1))] = F->mod - F->tab_w[2*llen + 2*(k+1)];
+                F->tab_iw[2*llen + 2*(llen-1-(k+1)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+1)+1]);
+                F->tab_iw[2*llen + 2*(llen-1-(k+2))] = F->mod - F->tab_w[2*llen + 2*(k+2)];
+                F->tab_iw[2*llen + 2*(llen-1-(k+2)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+2)+1]);
+                F->tab_iw[2*llen + 2*(llen-1-(k+3))] = F->mod - F->tab_w[2*llen + 2*(k+3)];
+                F->tab_iw[2*llen + 2*(llen-1-(k+3)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+3)+1]);
+            }
+        }
+
+        F->depth = depth;
+    }
+}
diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c
new file mode 100644
index 0000000000..772edbd3b0
--- /dev/null
+++ b/src/n_fft/dft.c
@@ -0,0 +1,295 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "n_fft.h"
+#include "n_fft_macros.h"
+
+/** Structure.
+ * - The main interface is n_fft_dft, it solves the problem at node 0
+ *   (evaluating at all roots of unity of order 2**depth), as documented
+ *   in n_fft.h.
+ * - The core function is `dft_node_lazy_4_4`, which goes down the subproduct
+ *   tree from an arbitrary node in this tree; it takes input values in [0..4n)
+ *   and return values in [0..4n), following the idea of lazy butterflies
+ *   highlighted by David Harvey [Faster arithmetic for number-theoretic
+ *   transforms, Journal of Symbolic Computation, Volume 60, 2014, pp 113-119].
+ * - This core function costs more than a DFT at node 0, at least for small or
+ *   smallish lengths. So a specific function for node 0 is given
+ *   (`dft_lazy_1_4`), targeting input values in [0..n) and return values in
+ *   [0..4n) (it itself uses a similar function `dft_lazy_2_4`). The main
+ *   function `n_fft_dft` just calls `dft_lazy_1_4` and then reduces the output
+ *   to [0..n).
+ */
+
+/** Example for nodes/depth:
+ *   if F.depth is 3, the tree of roots of unity in F->tab_w is
+ *                    1                               d3n0                <-- depth 3 
+ *               /        \                        /        \
+ *             1            -1                 d2n0          d2n1         <-- depth 2
+ *           /   \        /     \     =        /   \        /     \
+ *         1     -1      I      -I         d1n0   d1n1   d1n2    d1n3     <-- depth 1
+ *        / \    / \    / \    /  \         / \    / \    / \    /  \
+ *       1  -1  I  -I  J  -J  IJ -IJ       1  -1  I  -I  J  -J  IJ -IJ    <-- depth 0
+ *  stored as, omitting precomputations:
+ *    F->tab_w == [1, 1_pr, I, I_pr, J, J_pr, IJ, IJ_pr]
+ *  (the elements -1, -I, -J, -IJ are not stored)
+ *
+ *
+ *  -> calling a function with depth==3 and node==0 is performing
+ *  evaluation at all these 8 points (8th roots of 1)
+ *  -> calling a function with depth==2 and node==0 is performing
+ *  evaluation at all points at the leaves of the left child d2n0
+ *  of the root of the tree d3n0 (4th roots of 1)
+ *  -> calling a function with depth==2 and node==1 is performing
+ *  evaluation at all points at the leaves of the right child d2n1
+ *  of d3n0 (4th roots of -1)
+ *  -> calling a function with depth==1 and node==1 is performing
+ *  evaluation at all points at the leaves of the subtree rooted
+ *  at d1n1 (square roots of -1)
+ *  -> calling a function with depth==1 and node==2 is performing
+ *  evaluation at all points at the leaves of the subtree rooted
+ *  at d1n2 (square roots of I)
+ */
+
+/*-----------------------*/
+/*  auxiliary functions  */
+/*-----------------------*/
+
+/** 2**depth-point DFT, general node
+ * * In-place transform p of length len == 2**depth, seen as a polynomial of
+ * degree < len, into the concatenation of all polynomial evaluations
+ *          [p(w_k), p(-w_k)] for k in range(len/2),
+ * where w_k = F->tab_w[2**depth * node + 2*k] for 0 <= k < 2**(depth-1)
+ * * By construction these evaluation points are the len roots of the
+ * polynomial x**len - w[node], with the notation w[2*j] == F->tab_w[2*j] and
+ * w[2*j+1] == - F->tab_w[2*j] (for example, if node == 1, they are the len
+ * roots of x**len + 1)
+ * * Requirements (not checked):
+ *        3 <= depth
+ *        (node+1) * 2**depth < 2**F.depth (length of F->tab_w)
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+void dft_node_lazy_4_4(nn_ptr p, ulong depth, ulong node, n_fft_args_t F)
+{
+    // base cases: lengths 8, 16, 32 are handled by dedicated macros
+    if (depth == 3)
+    {
+        DFT8_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w);
+    }
+    else if (depth == 4)
+    {
+        DFT16_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                            node, F->mod, F->mod2, F->tab_w);
+    }
+    else if (depth == 5)
+    {
+        DFT32_NODE_LAZY_4_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                            p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                            p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23],
+                            p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31],
+                            node, F->mod, F->mod2, F->tab_w);
+    }
+    else
+    {
+        // general case, depth >= 6 under the requirement 3 <= depth
+        const ulong len = UWORD(1) << depth;
+
+        // 4-point butterflies
+        // in: [0..4n), out: [0..4n)
+        const nn_ptr p0 = p;
+        const nn_ptr p1 = p+len/4;
+        const nn_ptr p2 = p+2*len/4;
+        const nn_ptr p3 = p+3*len/4;
+        // twiddle factors with their precomputations: the pair stored at
+        // tab_w[2*node], and the two pairs stored at tab_w[4*node] and
+        // tab_w[4*node+2]
+        const ulong w2 = F->tab_w[2*node];
+        const ulong w2pre = F->tab_w[2*node+1];
+        const ulong w = F->tab_w[4*node];
+        const ulong wpre = F->tab_w[4*node+1];
+        const ulong Iw = F->tab_w[4*node+2];
+        const ulong Iwpre = F->tab_w[4*node+3];
+
+        // unrolled by 4 (len/4 is a multiple of 4 here)
+        for (ulong k = 0; k < len/4; k+=4)
+        {
+            DFT4_NODE_LAZY_4_4(p0[k+0], p1[k+0], p2[k+0], p3[k+0], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2);
+            DFT4_NODE_LAZY_4_4(p0[k+1], p1[k+1], p2[k+1], p3[k+1], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2);
+            DFT4_NODE_LAZY_4_4(p0[k+2], p1[k+2], p2[k+2], p3[k+2], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2);
+            DFT4_NODE_LAZY_4_4(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2);
+        }
+
+        // 4 recursive calls with depth-2, on the four children two levels down
+        dft_node_lazy_4_4(p0, depth-2, 4*node, F);
+        dft_node_lazy_4_4(p1, depth-2, 4*node+1, F);
+        dft_node_lazy_4_4(p2, depth-2, 4*node+2, F);
+        dft_node_lazy_4_4(p3, depth-2, 4*node+3, F);
+    }
+}
+
+/** 2**depth-point DFT (node 0), lazy variants
+ * Same specification as n_fft_dft, except for the ranges of values:
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ *             requirement (not checked): depth <= F.depth
+ * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n
+ *             requirement (not checked): 3 <= depth <= F.depth
+ */
+void dft_lazy_2_4(nn_ptr p, ulong depth, n_fft_args_t F)
+{
+    // base cases: lengths 8, 16, 32 are handled by dedicated macros
+    if (depth == 3)
+    {
+        DFT8_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w);
+        return;
+    }
+
+    if (depth == 4)
+    {
+        DFT16_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                       p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                       F->mod, F->mod2, F->tab_w);
+        return;
+    }
+
+    if (depth == 5)
+    {
+        DFT32_LAZY_2_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                       p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                       p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23],
+                       p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31],
+                       F->mod, F->mod2, F->tab_w);
+        return;
+    }
+
+    // general case: one layer of 4-point butterflies over the four quarters
+    // of p, then one recursive call per quarter
+    const ulong len = UWORD(1) << depth;
+    const ulong quarter = len/4;
+    const nn_ptr p0 = p;
+    const nn_ptr p1 = p + quarter;
+    const nn_ptr p2 = p + 2*quarter;
+    const nn_ptr p3 = p + 3*quarter;
+
+    // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n)
+    // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n)
+    // (the conditional subtraction is what keeps p0 in [0..2n))
+    for (ulong k = 0; k < quarter; k++)
+    {
+        DFT4_LAZY_2_4(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2);
+        if (p0[k] >= F->mod2)
+            p0[k] -= F->mod2;
+    }
+
+    // first quarter stays at node 0, the others are at nodes 1, 2, 3
+    dft_lazy_2_4(p0, depth-2, F);
+    dft_node_lazy_4_4(p1, depth-2, 1, F);
+    dft_node_lazy_4_4(p2, depth-2, 2, F);
+    dft_node_lazy_4_4(p3, depth-2, 3, F);
+}
+
+/** 2**depth-point DFT (node 0), lazy_1_4 variant:
+ * in [0..n) / out [0..4n), requirement (not checked): depth <= F.depth.
+ * Nothing is done when depth == 0.
+ */
+void dft_lazy_1_4(nn_ptr p, ulong depth, n_fft_args_t F)
+{
+    // small lengths (up to 32) are handled by dedicated macros
+    switch (depth)
+    {
+        case 0:
+            return;
+        case 1:
+            DFT2_LAZY_1_2(p[0], p[1], F->mod);
+            return;
+        case 2:
+            DFT4_LAZY_1_4(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2);
+            return;
+        case 3:
+            DFT8_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w);
+            return;
+        case 4:
+            DFT16_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                           p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                           F->mod, F->mod2, F->tab_w);
+            return;
+        case 5:
+            DFT32_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                           p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                           p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23],
+                           p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31],
+                           F->mod, F->mod2, F->tab_w);
+            return;
+        default:
+            break;
+    }
+
+    // general case (depth > 5): one layer of 4-point butterflies over the
+    // four quarters of p, then one recursive call per quarter
+    const ulong len = UWORD(1) << depth;
+    const ulong quarter = len/4;
+    const nn_ptr p0 = p;
+    const nn_ptr p1 = p + quarter;
+    const nn_ptr p2 = p + 2*quarter;
+    const nn_ptr p3 = p + 3*quarter;
+
+    // input p0,p1,p2,p3 in [0..n) x [0..n) x [0..n) x [0..n)
+    // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n)
+    // (the conditional subtraction is what keeps p0 in [0..2n))
+    for (ulong k = 0; k < quarter; k++)
+    {
+        DFT4_LAZY_1_4(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2);
+        if (p0[k] >= F->mod2)
+            p0[k] -= F->mod2;
+    }
+
+    // first quarter stays at node 0, the others are at nodes 1, 2, 3
+    dft_lazy_2_4(p0, depth-2, F);
+    dft_node_lazy_4_4(p1, depth-2, 1, F);
+    dft_node_lazy_4_4(p2, depth-2, 2, F);
+    dft_node_lazy_4_4(p3, depth-2, 3, F);
+}
+
+/*-------------------*/
+/*  main interfaces  */
+/*-------------------*/
+
+/** Forward DFT of length 2**depth, in place on p (see n_fft.h):
+ * runs dft_lazy_1_4 with the table tab_w, then brings every output value
+ * from [0..4n) back to [0..n). Nothing is done when depth == 0.
+ */
+void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F)
+{
+    if (depth == 0)
+        return;
+
+    n_fft_args_t Fargs;
+    n_fft_set_args(Fargs, F->mod, F->tab_w);
+    dft_lazy_1_4(p, depth, Fargs);
+
+    // final reduction: at most one subtraction of 2n and one of n per entry
+    const ulong len = UWORD(1) << depth;
+    for (ulong i = 0; i < len; i++)
+    {
+        ulong val = p[i];
+        if (val >= Fargs->mod2)
+            val -= Fargs->mod2;
+        if (val >= Fargs->mod)
+            val -= Fargs->mod;
+        p[i] = val;
+    }
+}
+
+/** Transposed inverse DFT of length 2**depth, in place on p (see n_fft.h):
+ * runs dft_lazy_1_4 with the table tab_iw (powers of 1/w), then multiplies
+ * every entry by the inverse of 2**depth. Nothing is done when depth == 0.
+ */
+void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F)
+{
+    if (depth == 0)
+        return;
+
+    n_fft_args_t Fargs;
+    n_fft_set_args(Fargs, F->mod, F->tab_iw);
+    dft_lazy_1_4(p, depth, Fargs);
+
+    // scaling by 1 / 2**depth, which is stored together with its
+    // precomputation at position 2*(depth-1) of tab_inv2
+    // (see comments in idft concerning this loop)
+    const ulong * scale = F->tab_inv2 + 2*(depth-1);
+    const ulong len = UWORD(1) << depth;
+    for (ulong i = 0; i < len; i++)
+        p[i] = n_mulmod_shoup(scale[0], p[i], scale[1], F->mod);
+}
+
+/*---------------*/
+/* some comments */
+/*---------------*/
+
+/** In n_fft_idft_t, there is apparently no gain from using the lazy
+ * mulmod_shoup variant whose output is in [0..2n) (so one may as well use the
+ * non-lazy one which ensures output < n)              
+ */
+
+/** Lazier variants for DFT with general node:
+ * - lazy_1_4 variants would be basically identical to the lazy_2_4 variants (see the macros)
+ * - writing lazy_2_4 variants of the DFTxx_NODE_LAZY_4_4 macros and then of
+ * dft_node_lazy_4_4 brings almost no speedup (very marginal gain up to length
+ * 32 or 64, nothing observable beyond this)
+ */
+
+/** Base cases:
+ * - having macros for "small" lengths (up to 16 or 32 at least) improves performance
+ * - removing the base cases depth==3 in internal functions where this case is
+ *   not really used (eg dft_node_lazy_4_4) does not make a difference
+ */
diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c
new file mode 100644
index 0000000000..f5c503686c
--- /dev/null
+++ b/src/n_fft/idft.c
@@ -0,0 +1,212 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "n_fft.h"
+#include "n_fft_macros.h"
+
+/** Structure.
+ * - The main interface is n_fft_idft, it solves the problem at node 0
+ *   (interpolating at all roots of unity of order 2**depth), as documented in
+ *   n_fft.h.
+ * - The core function is `idft_node_lazy_1_2`, which goes up the subproduct
+ *   tree towards an arbitrary node in this tree; it takes input values in
+ *   [0..n) and returns values in [0..2n), following the idea of lazy
+ *   butterflies highlighted by David Harvey [Faster arithmetic for
+ *   number-theoretic transforms, Journal of Symbolic Computation, Volume 60,
+ *   2014, pp 113-119]. This function does not scale the output by the inverse
+ *   of 2**depth.
+ * - This core function costs more than an iDFT at node 0, at least for small or
+ *   smallish lengths. So a specific function for node 0 is given
+ *   (`idft_lazy_1_4`), targeting input values in [0..n) and return values in
+ *   [0..4n). The main function `n_fft_idft` just calls `idft_lazy_1_4`, and
+ *   then scales the output value by the inverse of 2**depth, also ensuring the
+ *   output is in [0..n).
+ */
+
+/*************************
+*  auxiliary functions  *
+*************************/
+
+/** 2**depth-point inverse DFT, general node
+ * * In-place transform p = [p[i] for 0 <= i < len], where len == 2**depth,
+ * into the list of coefficients q = [q[j] for 0 <= j < len] of the unique
+ * polynomial q(x) of degree < len such that p[i] == q(w[i])  for 0 <= i < len
+ * * Here we write w[k] for 0 <= k < len/2, defined as
+ *            w[2*k]   == F->tab_w[2**depth * node + 2*k]
+ *            w[2*k+1] == - F->tab_w[2**depth * node + 2*k];
+ * these are the len roots of the polynomial x**len - F->tab_w[2*node]
+ * * Requirements (not checked):
+ *        3 <= depth
+ *        (node+1) * 2**depth < 2**F.depth (length of F->tab_w)
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ */
+void idft_node_lazy_1_2(nn_ptr p, ulong depth, ulong node, n_fft_args_t F)
+{
+    /* base cases: lengths 8, 16, 32 are handled by dedicated macros */
+    if (depth == 3)
+    {
+        IDFT8_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                            node, F->mod, F->mod2, F->tab_w);
+        return;
+    }
+    if (depth == 4)
+    {
+        IDFT16_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                             p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                             node, F->mod, F->mod2, F->tab_w);
+        return;
+    }
+    if (depth == 5)
+    {
+        IDFT32_NODE_LAZY_1_2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                             p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                             p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23],
+                             p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31],
+                             node, F->mod, F->mod2, F->tab_w);
+        return;
+    }
+
+    /* radix-4 step: one recursive call with depth-2 on each quarter of p */
+    const ulong len = UWORD(1) << depth;
+    const nn_ptr q0 = p;
+    const nn_ptr q1 = p + len/4;
+    const nn_ptr q2 = p + 2*len/4;
+    const nn_ptr q3 = p + 3*len/4;
+    idft_node_lazy_1_2(q0, depth-2, 4*node, F);
+    idft_node_lazy_1_2(q1, depth-2, 4*node+1, F);
+    idft_node_lazy_1_2(q2, depth-2, 4*node+2, F);
+    idft_node_lazy_1_2(q3, depth-2, 4*node+3, F);
+
+    /* twiddles, each followed by its companion precomputed entry */
+    const ulong tw1 = F->tab_w[2*node], tw1_pr = F->tab_w[2*node+1];
+    const ulong tw2 = F->tab_w[4*node], tw2_pr = F->tab_w[4*node+1];
+    const ulong tw3 = F->tab_w[4*node+2], tw3_pr = F->tab_w[4*node+3];
+
+    /* merge the four quarters, four 4-point butterflies per iteration */
+    for (ulong k = 0; k < len/4; k+=4)
+    {
+        IDFT4_NODE_LAZY_2_2(q0[k+0], q1[k+0], q2[k+0], q3[k+0], tw1, tw1_pr, tw2, tw2_pr, tw3, tw3_pr, F->mod, F->mod2);
+        IDFT4_NODE_LAZY_2_2(q0[k+1], q1[k+1], q2[k+1], q3[k+1], tw1, tw1_pr, tw2, tw2_pr, tw3, tw3_pr, F->mod, F->mod2);
+        IDFT4_NODE_LAZY_2_2(q0[k+2], q1[k+2], q2[k+2], q3[k+2], tw1, tw1_pr, tw2, tw2_pr, tw3, tw3_pr, F->mod, F->mod2);
+        IDFT4_NODE_LAZY_2_2(q0[k+3], q1[k+3], q2[k+3], q3[k+3], tw1, tw1_pr, tw2, tw2_pr, tw3, tw3_pr, F->mod, F->mod2);
+    }
+}
+
+/** 2**depth-point inverse DFT
+ * Same specification as n_fft_idft, except that the
+ * output values are in [0..4n)
+ */
+void idft_lazy_1_4(nn_ptr p, ulong depth, n_fft_args_t F)
+{
+    /* depths up to 5 (lengths up to 32) are handled by dedicated macros */
+    switch (depth)
+    {
+    case 0:
+        return;
+    case 1:
+        DFT2_LAZY_1_2(p[0], p[1], F->mod);
+        return;
+    case 2:
+        IDFT4_LAZY_1_4(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3],
+                       F->mod, F->mod2);
+        return;
+    case 3:
+        IDFT8_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                       F->mod, F->mod2, F->tab_w);
+        return;
+    case 4:
+        IDFT16_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                        p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                        F->mod, F->mod2, F->tab_w);
+        return;
+    case 5:
+        IDFT32_LAZY_1_4(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+                        p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15],
+                        p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23],
+                        p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31],
+                        F->mod, F->mod2, F->tab_w);
+        return;
+    default:
+        break;
+    }
+
+    /* depth >= 6, radix-4 step: work on the four quarters of p */
+    const ulong len = UWORD(1) << depth;
+    const nn_ptr q0 = p;
+    const nn_ptr q1 = p + len/4;
+    const nn_ptr q2 = p + 2*len/4;
+    const nn_ptr q3 = p + 3*len/4;
+
+    /* first quarter stays at node 0, the others go through nodes 1, 2, 3 */
+    idft_lazy_1_4(q0, depth-2, F);
+    idft_node_lazy_1_2(q1, depth-2, 1, F);
+    idft_node_lazy_1_2(q2, depth-2, 2, F);
+    idft_node_lazy_1_2(q3, depth-2, 3, F);
+
+    /* 4-point butterflies merging the quarters:     */
+    /*   in:  q0 in [0..4n), q1, q2, q3 in [0..2n)   */
+    /*   out: q0, q1, q2, q3 in [0..4n)              */
+    const ulong tw = F->tab_w[2];
+    const ulong tw_pr = F->tab_w[3];
+    for (ulong k = 0; k < len/4; k+=4)
+    {
+        IDFT4_LAZY_4222_4(q0[k+0], q1[k+0], q2[k+0], q3[k+0], tw, tw_pr, F->mod, F->mod2);
+        IDFT4_LAZY_4222_4(q0[k+1], q1[k+1], q2[k+1], q3[k+1], tw, tw_pr, F->mod, F->mod2);
+        IDFT4_LAZY_4222_4(q0[k+2], q1[k+2], q2[k+2], q3[k+2], tw, tw_pr, F->mod, F->mod2);
+        IDFT4_LAZY_4222_4(q0[k+3], q1[k+3], q2[k+3], q3[k+3], tw, tw_pr, F->mod, F->mod2);
+    }
+}
+
+
+/*-------------------*/
+/*  main interfaces  */
+/*-------------------*/
+
+void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F)
+{
+    if (depth == 0)  /* length 1: nothing to do */
+        return;
+    n_fft_args_t Fargs;
+    n_fft_set_args(Fargs, F->mod, F->tab_w);
+    idft_lazy_1_4(p, depth, Fargs);  /* leaves values in [0..4n) */
+    /* final correction: subtract 2n, then n, wherever still too large */
+    const ulong len = UWORD(1) << depth;
+    for (ulong i = 0; i < len; i++)
+    {
+        ulong val = p[i];
+        val -= (val >= Fargs->mod2) ? Fargs->mod2 : 0;
+        p[i] = val - ((val >= Fargs->mod) ? Fargs->mod : 0);
+    }
+}
+
+void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F)
+{
+    if (depth == 0)  /* length 1: nothing to do */
+        return;
+    n_fft_args_t Fargs;
+    n_fft_set_args(Fargs, F->mod, F->tab_iw);
+    idft_lazy_1_4(p, depth, Fargs);  /* unscaled, values in [0..4n) */
+
+    /* scale by the inverse of 2**depth; non-lazy mulmod so that output < n */
+    const ulong len = UWORD(1) << depth;
+    const ulong scal = F->tab_inv2[2*depth-2], scal_pr = F->tab_inv2[2*depth-1];
+    for (ulong i = 0; i < len; i++)
+        p[i] = n_mulmod_shoup(scal, p[i], scal_pr, F->mod);
+}
+
+/*---------------*/
+/* some comments */
+/*---------------*/
+
+/** In n_fft_idft, there is apparently no gain from using the lazy mulmod_shoup
+ * variant whose output is in [0..2n) (so one may as well use the non-lazy one
+ * which ensures output < n).
+ */
diff --git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h
new file mode 100644
index 0000000000..c2b33e922b
--- /dev/null
+++ b/src/n_fft/n_fft_macros.h
@@ -0,0 +1,997 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#ifndef N_FFT_MACROS_H
+#define N_FFT_MACROS_H
+
+#include "longlong.h"      /* for umul_ppmm */
+#include "ulong_extras.h"  /* for mulmod_shoup* functions */
+
+/*---------*/
+/* helpers */
+/*---------*/
+
+/** Shoup's modular multiplication with precomputation, lazy
+ * (no excess correction step; b is expanded twice; p_hi, p_lo are temporaries)
+ *  --> computes either r or r+n and stores it in res, where r = (a*b) % n
+ *  --> a_pr is the precomputed data for multiplication by a mod n
+ */
+#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n)             \
+do {                                                          \
+    ulong p_hi, p_lo;                                         \
+    umul_ppmm(p_hi, p_lo, (a_pr), (b));                       \
+    res = (a) * (b) - p_hi * (n);                             \
+} while(0)
+
+/*------------------*/
+/* length 2, node 0 */
+/*------------------*/
+
+/** Butterfly radix 2
+ * * In-place transform:                    [1  1]
+ *                         [a  b] <- [a  b] [1 -1]
+ * * n is the modulus, n2 is 2*n
+ * * lazy_1_2:    in [0..n) / out [0..2n) / max < 2n
+ * * lazy_22_24:  in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n
+ * * lazy_42_44:  in [0..4n) x [0..2n) / out [0..4n) x [0..4n) / max < 4n
+ */
+#define DFT2_LAZY_1_2(a, b, n)               \
+do {                                         \
+    ulong tmp;                               \
+    tmp = (b);                               \
+    (b) = (a) + (n) - tmp;     /* [0..2n) */ \
+    (a) = (a) + tmp;           /* [0..2n) */ \
+} while(0)
+
+#define DFT2_LAZY_22_24(a, b, n2)            \
+do {                                         \
+    ulong tmp;                               \
+    tmp = (b);                               \
+    (b) = (a) + (n2) - tmp;    /* [0..4n) */ \
+    (a) = (a) + tmp;           /* [0..4n) */ \
+    if ((a) >= (n2))                         \
+        (a) -= (n2);           /* [0..2n) */ \
+} while(0)
+
+#define DFT2_LAZY_42_44(a, b, n2)          \
+do {                                       \
+    ulong tmp;                             \
+    tmp = (a);                             \
+    if (tmp >= (n2))                       \
+        tmp -= (n2);         /* [0..2n) */ \
+    (a) = tmp + (b);         /* [0..4n) */ \
+    (b) = tmp + (n2) - (b);  /* [0..4n) */ \
+} while(0)
+
+/*----------------------------------------------*/
+/* length 2, general node                       */
+/* (Cooley-Tukey & Gentleman-Sande butterflies) */
+/*----------------------------------------------*/
+
+/** Cooley-Tukey butterfly:
+ * * In-place transform
+ *                            [1  1]
+ *           [a  b] <- [a  b] [w -w]
+ * * n2 is 2*n, w_pr is the precomputed data for multiplication by w mod n
+ * * can be seen as evaluation at points w and -w of a+b*x
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+#define DFT2_NODE_LAZY_4_4(a, b, w, w_pr, n, n2)            \
+do {                                                        \
+    ulong u, v;                                             \
+    u = (a);                                                \
+    if (u >= (n2))                                          \
+        u -= (n2);                            /* [0..2n) */ \
+    v = (b);                                                \
+    N_MULMOD_PRECOMP_LAZY(v, w, v, w_pr, n);  /* [0..2n) */ \
+    (a) = u + v;                              /* [0..4n) */ \
+    (b) = u + (n2) - v;                       /* [0..4n) */ \
+} while(0)
+
+/** Gentleman-Sande butterfly:
+ * * In-place transform
+ *                            [1  w]
+ *           [a  b] <- [a  b] [1 -w]
+ * * n2 is 2*n, w_pr is the precomputed data for multiplication by w mod n
+ * * can be seen as degree-1 interpolation at points iw = 1 / w and -iw, up to
+ * a scaling by 1/2, since the inverse of [1  w] is  1/2 * [ 1   1]
+ *                                        [1 -w]           [iw -iw]
+ * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n
+ */
+#define IDFT2_NODE_LAZY_2_2(a, b, w, w_pr,        \
+                            n, n2)                \
+do {                                              \
+    ulong tmp;                                    \
+    tmp = (a) + (n2) - (b);     /* [0..4n) */     \
+    (a) = (a) + (b);            /* [0..4n) */     \
+    if ((a) >= (n2))                              \
+        (a) -= (n2);            /* [0..2n) */     \
+    N_MULMOD_PRECOMP_LAZY((b), w, tmp, w_pr, n);  \
+                        /* --> (b) in [0..2n) */  \
+} while(0)
+
+/*------------------*/
+/* length 4, node 0 */
+/*------------------*/
+
+/** 4-point FFT evaluation
+ * * In-place transform
+ *                              [1  1  1  1]
+ *                              [1 -1  I -I]
+ * [a  b  c  d] <- [a  b  c  d] [1  1 -1 -1]
+ *                              [1 -1 -I  I]
+ *                              [1  0  1  0] [1  1  0  0]
+ *              == [a  b  c  d] [0  1  0  I] [1 -1  0  0]
+ *                              [1  0 -1  0] [0  0  1  1]
+ *                              [0  1  0 -I] [0  0  1 -1]
+ * * Corresponds to reducing down the tree with nodes
+ *                       x^4 - 1
+ *                     /         \
+ *             x^2 - 1             x^2 + 1
+ *             /     \             /     \
+ *         x - 1     x + 1     x - I     x + I
+ *  where I is typically a square root of -1
+ *  (but this property is not exploited)
+ * * n is the modulus, n2 is 2*n
+ *   I_pr is the precomputed data for multiplication by I mod n
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n
+ */
+#define DFT4_LAZY_1_4(a, b, c, d, I, I_pr, n, n2)               \
+do {                                                            \
+    const ulong v0 = (a);                                       \
+    const ulong v1 = (b);                                       \
+    const ulong v2 = (c);                                       \
+    const ulong v3 = (d);                                       \
+    ulong v4 = v0 + v2;                     /* < 2*n */         \
+    ulong v5 = v0 + (n) - v2;               /* < 2*n */         \
+    ulong v6 = v1 + v3;                     /* < 2*n */         \
+    ulong v7;                               /* < 2*n (lazy) */  \
+    N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n) - v3, (I_pr), (n)); \
+    (a) = v4 + v6;                          /* < 4*n */         \
+    (b) = v4 + (n2) - v6;                   /* < 4*n */         \
+    (c) = v5 + v7;                          /* < 4*n */         \
+    (d) = v5 + (n2) - v7;                   /* < 4*n */         \
+} while(0)
+
+#define DFT4_LAZY_2_4(a, b, c, d, I, I_pr, n, n2)                 \
+do {                                                              \
+    const ulong v0 = (a);                                         \
+    const ulong v1 = (b);                                         \
+    const ulong v2 = (c);                                         \
+    const ulong v3 = (d);                                         \
+    ulong v4 = v0 + v2;                      /* < 4*n */          \
+    if (v4 >= (n2))                                               \
+        v4 -= (n2);                          /* < 2*n */          \
+    ulong v5 = v0 + (n2) - v2;               /* < 4*n */          \
+    if (v5 >= (n2))                                               \
+        v5 -= (n2);                          /* < 2*n */          \
+    ulong v6 = v1 + v3;                      /* < 4*n */          \
+    if (v6 >= (n2))                                               \
+        v6 -= (n2);                          /* < 2*n */          \
+    ulong v7;                                /* < 2*n (lazy) */   \
+    N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n));  \
+    (a) = v4 + v6;                           /* < 4*n */          \
+    (b) = v4 + (n2) - v6;                    /* < 4*n */          \
+    (c) = v5 + v7;                           /* < 4*n */          \
+    (d) = v5 + (n2) - v7;                    /* < 4*n */          \
+} while(0)
+
+/** 4-point FFT interpolation
+ * * In-place transform
+ *                              [1  1  1  1]
+ *                              [1 -1  1 -1]
+ * [a  b  c  d] <- [a  b  c  d] [1 -I -1  I]
+ *                              [1  I -1 -I]
+ *                              [1  1  0  0] [1  0  1  0]
+ *              == [a  b  c  d] [1 -1  0  0] [0  1  0  1]
+ *                              [0  0  1 -I] [1  0 -1  0]
+ *                              [0  0  1  I] [0  1  0 -1]
+ * (in these matrices, -I is the value passed as the macro argument I)
+ * * If I**2 == -1, this matrix is the inverse of the one above; this
+ * corresponds to interpolation at 1, -1, I, -I, up to scaling by 1/4; or to
+ * going up the tree with nodes
+ *                       x^4 - 1
+ *                     /         \
+ *             x^2 - 1             x^2 + 1
+ *             /     \             /     \
+ *         x - 1     x + 1     x - I     x + I
+ * * n is the modulus, n2 is 2*n
+ *   I_pr is the precomputed data for multiplication by I mod n
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ * * lazy_4222_4: a in [0..4n), b,c,d in [0..2n) / out [0..4n) / max < 4n
+ */
+#define IDFT4_LAZY_1_4(a, b, c, d, I, I_pr, n, n2)               \
+do {                                                             \
+    const ulong v0 = (a);                                        \
+    const ulong v1 = (b);                                        \
+    const ulong v2 = (c);                                        \
+    const ulong v3 = (d);                                        \
+    ulong v4 = v0 + v1;                         /* < 2*n */      \
+    ulong v5 = v0 + (n) - v1;                   /* < 2*n */      \
+    ulong v6 = v2 + v3;                         /* < 2*n */      \
+    ulong v7;                                                    \
+    N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n));  \
+    (a) = v4 + v6;                              /* < 4*n */      \
+    (b) = v5 + v7;                              /* < 4*n */      \
+    (c) = v4 + (n2) - v6;                       /* < 4*n */      \
+    (d) = v5 + (n2) - v7;                       /* < 4*n */      \
+} while(0)
+
+#define IDFT4_LAZY_4222_4(a, b, c, d, I, I_pr, n, n2)             \
+do {                                                              \
+    ulong v0 = (a);                                               \
+    const ulong v1 = (b);                                         \
+    const ulong v2 = (c);                                         \
+    const ulong v3 = (d);                                         \
+    if (v0 >= (n2))                                               \
+        v0 -= (n2);                             /* < 2*n */       \
+    ulong v4 = v0 + v1;                         /* < 4*n */       \
+    if (v4 >= (n2))                                               \
+        v4 -= (n2);                             /* < 2*n */       \
+    ulong v5 = v0 + (n2) - v1;                  /* < 4*n */       \
+    if (v5 >= (n2))                                               \
+        v5 -= (n2);                             /* < 2*n */       \
+    ulong v6 = v2 + v3;                         /* < 4*n */       \
+    if (v6 >= (n2))                                               \
+        v6 -= (n2);                             /* < 2*n */       \
+    ulong v7;                                                     \
+    N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n2) - v3, (I_pr), (n));  \
+    (a) = v4 + v6;                              /* < 4*n */       \
+    (b) = v5 + v7;                              /* < 4*n */       \
+    (c) = v4 + (n2) - v6;                       /* < 4*n */       \
+    (d) = v5 + (n2) - v7;                       /* < 4*n */       \
+} while(0)
+
+/*------------------------*/
+/* length 4, general node */
+/*------------------------*/
+
+/** 4-point FFT, evaluation, from general node
+ * * In-place transform
+ *                              [ 1          1       1       1]
+ *                              [w2        -w2      w3     -w3]
+ * [a  b  c  d] <- [a  b  c  d] [w1         w1     -w1     -w1]
+ *                              [w1*w2  -w1*w2  -w1*w3   w1*w3]
+ * * Corresponds to reducing down the tree with nodes
+ *                        x^4 - w1**2
+ *                      /             \
+ *             x^2 - w1                 x^2 + w1
+ *             /      \                 /      \
+ *        x - w2      x + w2       x - w3      x + w3
+ * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above
+ * is a Vandermonde matrix and this tree really is the subproduct tree built
+ * from the four roots w2, -w2, I*w2, -I*w2 of x**4 - w1**2
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+#define DFT4_NODE_LAZY_4_4(a, b, c, d,                        \
+                           w1, w1_pr, w2, w2_pr, w3, w3_pr,   \
+                           n, n2)                             \
+do {                                                          \
+    ulong tmp;                                                \
+    ulong u0 = (a);                                           \
+    ulong u1 = (b);                                           \
+    ulong u2 = (c);                                           \
+    ulong u3 = (d);                                           \
+    if (u0 >= (n2))                                           \
+        u0 -= (n2);                  /* [0..2n) */            \
+    if (u1 >= (n2))                                           \
+        u1 -= (n2);                  /* [0..2n) */            \
+                                                              \
+    N_MULMOD_PRECOMP_LAZY(u2, (w1), u2, (w1_pr), (n));        \
+    tmp = u0;                                                 \
+    u0 = u0 + u2;                    /* [0..4n) */            \
+    u2 = tmp + (n2) - u2;            /* [0..4n) */            \
+    if (u0 >= (n2))                                           \
+        u0 -= (n2);                  /* [0..2n) */            \
+    if (u2 >= (n2))                                           \
+        u2 -= (n2);                  /* [0..2n) */            \
+                                                              \
+    N_MULMOD_PRECOMP_LAZY(u3, (w1), u3, (w1_pr), (n));        \
+    tmp = u1;                                                 \
+    u1 = u1 + u3;                    /* [0..4n) */            \
+    u3 = tmp + (n2) - u3;            /* [0..4n) */            \
+                                                              \
+    N_MULMOD_PRECOMP_LAZY(u1, (w2), u1, (w2_pr), (n));        \
+    (a) = u0 + u1;                   /* [0..4n) */            \
+    (b) = u0 + (n2) - u1;            /* [0..4n) */            \
+                                                              \
+    N_MULMOD_PRECOMP_LAZY(u3, (w3), u3, (w3_pr), (n));        \
+    (c) = u2 + u3;                   /* [0..4n) */            \
+    (d) = u2 + (n2) - u3;            /* [0..4n) */            \
+} while(0)
+
+/** 4-point FFT, interpolation, general node
+ * * In-place transform (each iwK below is the value passed as argument wK)
+ *                              [ 1   iw2   iw1    iw1*iw2]
+ *                              [ 1  -iw2   iw1   -iw1*iw2]
+ * [a  b  c  d] <- [a  b  c  d] [ 1   iw3  -iw1   -iw1*iw3]
+ *                              [ 1  -iw3  -iw1    iw1*iw3]
+ *                              [1  iw2  0    0] [1   0  iw1    0]
+ *              == [a  b  c  d] [1 -iw2  0    0] [0   1    0  iw1]
+ *                              [0    0  1  iw3] [1   0 -iw1    0]
+ *                              [0    0  1 -iw3] [0   1    0 -iw1]
+ * * Corresponds, up to scaling by 1/4, to going up the tree with nodes
+ *                        x^4 - w1**2
+ *                      /             \
+ *             x^2 - w1                 x^2 + w1
+ *             /      \                 /      \
+ *        x - w2      x + w2       x - w3      x + w3
+ * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above
+ * is the inverse of a Vandermonde matrix and this tree really is the
+ * subproduct tree built from the four roots w2, -w2, I*w2, -I*w2 of x**4 - w1**2
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n
+ */
+#define IDFT4_NODE_LAZY_2_2(a, b, c, d,                              \
+                            w1, w1_pr, w2, w2_pr, w3, w3_pr,         \
+                            n, n2)                                   \
+do {                                                                 \
+    const ulong v0 = (a);                                            \
+    const ulong v1 = (b);                                            \
+    const ulong v2 = (c);                                            \
+    const ulong v3 = (d);                                            \
+    ulong v4 = v0 + v1;                       /* < 4*n */            \
+    if (v4 >= (n2))                                                  \
+        v4 -= (n2);                           /* < 2*n */            \
+    ulong v5;                                                        \
+    N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n2) - v1, (w2_pr), (n));   \
+    ulong v6 = v2 + v3;                       /* < 4*n */            \
+    if (v6 >= (n2))                                                  \
+        v6 -= (n2);                           /* < 2*n */            \
+    ulong v7;                                                        \
+    N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n2) - v3, (w3_pr), (n));   \
+                                                                     \
+    (a) = v4 + v6;                            /* < 4*n */            \
+    if ((a) >= (n2))                                                 \
+        (a) -= (n2);                           /* < 2*n */           \
+    (b) = v5 + v7;                            /* < 4*n */            \
+    if ((b) >= (n2))                                                 \
+        (b) -= (n2);                           /* < 2*n */           \
+    N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n));  \
+    N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n));  \
+} while(0)
+
+#define IDFT4_NODE_LAZY_1_2(a, b, c, d,                              \
+                            w1, w1_pr, w2, w2_pr, w3, w3_pr,         \
+                            n, n2)                                   \
+do {                                                                 \
+    const ulong v0 = (a);                                            \
+    const ulong v1 = (b);                                            \
+    const ulong v2 = (c);                                            \
+    const ulong v3 = (d);                                            \
+    ulong v4 = v0 + v1;                       /* < 2*n */            \
+    ulong v5;                                                        \
+    N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n) - v1, (w2_pr), (n));    \
+    ulong v6 = v2 + v3;                       /* < 2*n */            \
+    ulong v7;                                                        \
+    N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n) - v3, (w3_pr), (n));    \
+                                                                     \
+    (a) = v4 + v6;                            /* < 4*n */            \
+    if ((a) >= (n2))                                                 \
+        (a) -= (n2);                           /* < 2*n */           \
+    (b) = v5 + v7;                            /* < 4*n */            \
+    if ((b) >= (n2))                                                 \
+        (b) -= (n2);                           /* < 2*n */           \
+    N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n));  \
+    N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n));  \
+} while(0)
+
+/*------------------*/
+/* length 8, node 0 */
+/*------------------*/
+
+/** 8-point FFT, evaluation
+ * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial
+ * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations
+ *       p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J)
+ * i.e. the evaluations at all 8-th roots of unity J**k for 0 <= k < 8 in
+ * bit-reversed order
+ * * Recall: [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ]
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n
+ */
+#define DFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \
+                      n, n2, tab_w)                   \
+do {                                                  \
+    DFT2_LAZY_1_2(p0, p4, n);                         \
+    DFT2_LAZY_1_2(p1, p5, n);                         \
+    DFT2_LAZY_1_2(p2, p6, n);                         \
+    DFT2_LAZY_1_2(p3, p7, n);                         \
+                                                      \
+    DFT4_LAZY_2_4(p0, p1, p2, p3,                     \
+                tab_w[2], tab_w[3],                   \
+                n, n2);                               \
+    /* could use a lazy_2_4 variant of the  */        \
+    /* next one, but the gain is negligible */        \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                \
+                     tab_w[2], tab_w[3],              \
+                     tab_w[4], tab_w[5],              \
+                     tab_w[6], tab_w[7],              \
+                     n, n2);                          \
+} while(0)
+
+#define DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7,  \
+                      n, n2, tab_w)                    \
+do {                                                   \
+    DFT2_LAZY_22_24(p0, p4, n2);                       \
+    DFT2_LAZY_22_24(p1, p5, n2);                       \
+    DFT2_LAZY_22_24(p2, p6, n2);                       \
+    DFT2_LAZY_22_24(p3, p7, n2);                       \
+                                                       \
+    DFT4_LAZY_2_4(p0, p1, p2, p3,                      \
+                tab_w[2], tab_w[3],                    \
+                n, n2);                                \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                 \
+                     tab_w[2], tab_w[3],               \
+                     tab_w[4], tab_w[5],               \
+                     tab_w[6], tab_w[7],               \
+                     n, n2);                           \
+} while(0)
+
+/** 8-point FFT, interpolation
+ * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as the evaluations
+ *       [p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J)]
+ * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order
+ * of a polynomial p(x) of degree < 8, into the coefficients of this polynomial
+ * * Recall: [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ]
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ */
+#define IDFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, \
+                       n, n2, tab_w)                   \
+do {                                                   \
+    IDFT4_LAZY_1_4(p0, p1, p2, p3,                     \
+                   tab_w[2], tab_w[3],                 \
+                   n, n2);                             \
+    IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7,                \
+                tab_w[2], tab_w[3],                    \
+                tab_w[4], tab_w[5],                    \
+                tab_w[6], tab_w[7],                    \
+                n, n2);                                \
+    /* p0..p3 in [0..4n), p4..p7 in [0..2n) */         \
+    DFT2_LAZY_42_44(p0, p4, n2);                       \
+    DFT2_LAZY_42_44(p1, p5, n2);                       \
+    DFT2_LAZY_42_44(p2, p6, n2);                       \
+    DFT2_LAZY_42_44(p3, p7, n2);                       \
+} while(0)
+
+/*------------------------*/
+/* length 8, general node */
+/*------------------------*/
+
+/** 8-point FFT (evaluation) at a general node
+ * * Transforms p = [p0,p1,p2,p3,p4,p5,p6,p7] in place: the input holds the
+ * coefficients of p(x) = p0 + p1*x + ... + p7*x**7, the output its values
+ *       p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3)
+ * where w_k = F->tab_w[8*node + 2*k] for 0 <= k < 4
+ * * By construction these 8 evaluation points are the 8 roots of the
+ * polynomial x**8 - F->tab_w[node]
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+#define DFT8_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, node, n, n2, tab_w)  \
+do {                                                                            \
+    /* first layer uses the pair of table entries at index 2*node */            \
+    const ulong w = tab_w[2*(node)];                                            \
+    const ulong w_pr = tab_w[2*(node)+1];                                       \
+    DFT2_NODE_LAZY_4_4(p0, p4, w, w_pr, n, n2);                                 \
+    DFT2_NODE_LAZY_4_4(p1, p5, w, w_pr, n, n2);                                 \
+    DFT2_NODE_LAZY_4_4(p2, p6, w, w_pr, n, n2);                                 \
+    DFT2_NODE_LAZY_4_4(p3, p7, w, w_pr, n, n2);                                 \
+                                                                                \
+    DFT4_NODE_LAZY_4_4(p0, p1, p2, p3,                                          \
+                       tab_w[4*(node)], tab_w[4*(node)+1],                      \
+                       tab_w[8*(node)], tab_w[8*(node)+1],                      \
+                       tab_w[8*(node)+2], tab_w[8*(node)+3],                    \
+                       n, n2);                                                  \
+                                                                                \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                                          \
+                       tab_w[4*(node)+2], tab_w[4*(node)+3],                    \
+                       tab_w[8*(node)+4], tab_w[8*(node)+5],                    \
+                       tab_w[8*(node)+6], tab_w[8*(node)+7],                    \
+                       n, n2);                                                  \
+} while(0)
+
+/** 8-point inverse FFT (interpolation) at a general node
+ * * Transforms p = [p0,p1,p2,p3,p4,p5,p6,p7] in place: the input holds the
+ * values [p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3)] of a
+ * polynomial p(x) of degree < 8, where w_k = F->tab_w[8*node + 2*k] for
+ * 0 <= k < 4; the output holds the coefficients of p(x)
+ * * By construction these 8 evaluation points are the 8 roots of the
+ * polynomial x**8 - F->tab_w[node]
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ * * lazy_2_2: in [0..2n) / out [0..2n) / max < 4n
+ */
+#define IDFT8_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, node, n, n2, tab_w)  \
+do {                                                                             \
+    /* last layer uses the pair of table entries at index 2*node */              \
+    const ulong w = tab_w[2*(node)];                                             \
+    const ulong w_pr = tab_w[2*(node)+1];                                        \
+                                                                                 \
+    IDFT4_NODE_LAZY_1_2(p0, p1, p2, p3,                                          \
+                        tab_w[4*(node)], tab_w[4*(node)+1],                      \
+                        tab_w[8*(node)], tab_w[8*(node)+1],                      \
+                        tab_w[8*(node)+2], tab_w[8*(node)+3],                    \
+                        n, n2);                                                  \
+                                                                                 \
+    IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7,                                          \
+                        tab_w[4*(node)+2], tab_w[4*(node)+3],                    \
+                        tab_w[8*(node)+4], tab_w[8*(node)+5],                    \
+                        tab_w[8*(node)+6], tab_w[8*(node)+7],                    \
+                        n, n2);                                                  \
+                                                                                 \
+    IDFT2_NODE_LAZY_2_2(p0, p4, w, w_pr, n, n2);                                 \
+    IDFT2_NODE_LAZY_2_2(p1, p5, w, w_pr, n, n2);                                 \
+    IDFT2_NODE_LAZY_2_2(p2, p6, w, w_pr, n, n2);                                 \
+    IDFT2_NODE_LAZY_2_2(p3, p7, w, w_pr, n, n2);                                 \
+} while(0)
+
+/*-------------------*/
+/* length 16, node 0 */
+/*-------------------*/
+
+/** 16-point FFT, evaluation (DFT16_LAZY_1_4 and DFT16_LAZY_2_4)
+ * * In-place transform p of length 16, seen as a polynomial
+ * p(x) = p0 + p1*x + ... + p15*x**15, into its evaluations
+ * at all 16th roots of unity 1, -1, I, -I... (bit-reversed order)
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n
+ */
+#define DFT16_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7,           \
+                       p8, p9, p10, p11, p12, p13, p14, p15,     \
+                       n, n2, tab_w)                             \
+do {                                                             \
+    DFT4_LAZY_1_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2);   \
+    if (p0 >= n2)                                                \
+        p0 -= n2;                                                \
+    DFT4_LAZY_1_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2);   \
+    if (p1 >= n2)                                                \
+        p1 -= n2;                                                \
+    DFT4_LAZY_1_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2);  \
+    if (p2 >= n2)                                                \
+        p2 -= n2;                                                \
+    DFT4_LAZY_1_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2);  \
+    if (p3 >= n2)                                                \
+        p3 -= n2;                                                \
+                                                                 \
+    /* next call needs p0..p3 < 2n,    */                        \
+    /* hence the four reductions above */                        \
+    DFT4_LAZY_2_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2);    \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                           \
+                       tab_w[2], tab_w[3],                       \
+                       tab_w[4], tab_w[5],                       \
+                       tab_w[6], tab_w[7],                       \
+                       n, n2);                                   \
+    DFT4_NODE_LAZY_4_4(p8, p9, p10, p11,                         \
+                       tab_w[4], tab_w[5],                       \
+                       tab_w[8], tab_w[9],                       \
+                       tab_w[10], tab_w[11],                     \
+                       n, n2);                                   \
+    DFT4_NODE_LAZY_4_4(p12, p13, p14, p15,                       \
+                       tab_w[6], tab_w[7],                       \
+                       tab_w[12], tab_w[13],                     \
+                       tab_w[14], tab_w[15],                     \
+                       n, n2);                                   \
+} while(0)
+
+#define DFT16_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7,           \
+                       p8, p9, p10, p11, p12, p13, p14, p15,     \
+                       n, n2, tab_w)                             \
+do { /* 16-point DFT, node 0, inputs in [0..2n) */               \
+    DFT4_LAZY_2_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2);   \
+    if (p0 >= n2)                                                \
+        p0 -= n2;                                                \
+    DFT4_LAZY_2_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2);   \
+    if (p1 >= n2)                                                \
+        p1 -= n2;                                                \
+    DFT4_LAZY_2_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2);  \
+    if (p2 >= n2)                                                \
+        p2 -= n2;                                                \
+    DFT4_LAZY_2_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2);  \
+    if (p3 >= n2)                                                \
+        p3 -= n2;                                                \
+                                                                 \
+    /* next call needs p0..p3 < 2n,    */                        \
+    /* hence the four reductions above */                        \
+    DFT4_LAZY_2_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2);    \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                           \
+                       tab_w[2], tab_w[3],                       \
+                       tab_w[4], tab_w[5],                       \
+                       tab_w[6], tab_w[7],                       \
+                       n, n2);                                   \
+    DFT4_NODE_LAZY_4_4(p8, p9, p10, p11,                         \
+                       tab_w[4], tab_w[5],                       \
+                       tab_w[8], tab_w[9],                       \
+                       tab_w[10], tab_w[11],                     \
+                       n, n2);                                   \
+    DFT4_NODE_LAZY_4_4(p12, p13, p14, p15,                       \
+                       tab_w[6], tab_w[7],                       \
+                       tab_w[12], tab_w[13],                     \
+                       tab_w[14], tab_w[15],                     \
+                       n, n2);                                   \
+} while(0)
+
+/** 16-point inverse FFT (interpolation), node 0
+ * * Transforms p of length 16 in place: the input holds the values of a
+ * polynomial p(x) of degree < 16 at all 16th roots of unity 1, -1, I, -I...
+ * (bit-reversed order); the output holds the coefficients of p(x)
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ */
+#define IDFT16_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7,              \
+                        p8, p9, p10, p11, p12, p13, p14, p15,        \
+                        n, n2, tab_w)                                \
+do {                                                                 \
+    /* layer 1: 4-point inverse transform on each quarter */         \
+    IDFT4_LAZY_1_4(p0, p1, p2, p3, tab_w[2], tab_w[3], n, n2);       \
+    IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7,                              \
+                        tab_w[2], tab_w[3],                          \
+                        tab_w[4], tab_w[5], tab_w[6], tab_w[7],      \
+                        n, n2);                                      \
+    IDFT4_NODE_LAZY_1_2(p8, p9, p10, p11,                            \
+                        tab_w[4], tab_w[5],                          \
+                        tab_w[8], tab_w[9], tab_w[10], tab_w[11],    \
+                        n, n2);                                      \
+    IDFT4_NODE_LAZY_1_2(p12, p13, p14, p15,                          \
+                        tab_w[6], tab_w[7],                          \
+                        tab_w[12], tab_w[13], tab_w[14], tab_w[15],  \
+                        n, n2);                                      \
+                                                                     \
+    /* layer 2: 4-point inverse transform across quarters */         \
+    /* (4222 presumably: 1st input < 4n, others < 2n) */             \
+    IDFT4_LAZY_4222_4(p0, p4, p8, p12, tab_w[2], tab_w[3], n, n2);   \
+    IDFT4_LAZY_4222_4(p1, p5, p9, p13, tab_w[2], tab_w[3], n, n2);   \
+    IDFT4_LAZY_4222_4(p2, p6, p10, p14, tab_w[2], tab_w[3], n, n2);  \
+    IDFT4_LAZY_4222_4(p3, p7, p11, p15, tab_w[2], tab_w[3], n, n2);  \
+} while(0)
+
+/*-------------------------*/
+/* length 16, general node */
+/*-------------------------*/
+
+/** 16-point FFT, evaluation, general node
+ * * In-place transform p of length 16, seen as a polynomial
+ * p(x) = p0 + p1*x + ... + p15*x**15, into its evaluations
+ *       p(w0), p(-w0), p(w1), p(-w1), ..., p(w7), p(-w7)
+ * where w_k = F->tab_w[16*node + 2*k] for 0 <= k < 8; these 16 points are,
+ * by construction, the 16 roots of the polynomial x**16 - F->tab_w[node]
+ * * node is parenthesized on each use, so it may be an expression (e.g. 4*k+1)
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+#define DFT16_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7,       \
+                            p8, p9, p10, p11, p12, p13, p14, p15, \
+                            node, n, n2, tab_w)                   \
+do {                                                              \
+    ulong w2, w2pre, w, wpre, Iw, Iwpre;                          \
+    /* layer 1: same roots for all four 4-point transforms */     \
+    w2 = tab_w[2*(node)];                                         \
+    w2pre = tab_w[2*(node)+1];                                    \
+    w = tab_w[4*(node)];                                          \
+    wpre = tab_w[4*(node)+1];                                     \
+    Iw = tab_w[4*(node)+2];                                       \
+    Iwpre = tab_w[4*(node)+3];                                    \
+                                                                  \
+    DFT4_NODE_LAZY_4_4(p0, p4, p8, p12,                           \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+    DFT4_NODE_LAZY_4_4(p1, p5, p9, p13,                           \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+    DFT4_NODE_LAZY_4_4(p2, p6, p10, p14,                          \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+    DFT4_NODE_LAZY_4_4(p3, p7, p11, p15,                          \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+    /* layer 2: each group of four has its own roots */           \
+    w2 = tab_w[8*(node)];                                         \
+    w2pre = tab_w[8*(node)+1];                                    \
+    w = tab_w[16*(node)];                                         \
+    wpre = tab_w[16*(node)+1];                                    \
+    Iw = tab_w[16*(node)+2];                                      \
+    Iwpre = tab_w[16*(node)+3];                                   \
+    DFT4_NODE_LAZY_4_4(p0, p1, p2, p3,                            \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+                                                                  \
+    w2 = tab_w[8*(node)+2];                                       \
+    w2pre = tab_w[8*(node)+3];                                    \
+    w = tab_w[16*(node)+4];                                       \
+    wpre = tab_w[16*(node)+5];                                    \
+    Iw = tab_w[16*(node)+6];                                      \
+    Iwpre = tab_w[16*(node)+7];                                   \
+    DFT4_NODE_LAZY_4_4(p4, p5, p6, p7,                            \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+                                                                  \
+    w2 = tab_w[8*(node)+4];                                       \
+    w2pre = tab_w[8*(node)+5];                                    \
+    w = tab_w[16*(node)+8];                                       \
+    wpre = tab_w[16*(node)+9];                                    \
+    Iw = tab_w[16*(node)+10];                                     \
+    Iwpre = tab_w[16*(node)+11];                                  \
+    DFT4_NODE_LAZY_4_4(p8, p9, p10, p11,                          \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+                                                                  \
+    w2 = tab_w[8*(node)+6];                                       \
+    w2pre = tab_w[8*(node)+7];                                    \
+    w = tab_w[16*(node)+12];                                      \
+    wpre = tab_w[16*(node)+13];                                   \
+    Iw = tab_w[16*(node)+14];                                     \
+    Iwpre = tab_w[16*(node)+15];                                  \
+    DFT4_NODE_LAZY_4_4(p12, p13, p14, p15,                        \
+                       w2, w2pre, w, wpre, Iw, Iwpre,             \
+                       n, n2);                                    \
+} while(0)
+
+/** 16-point FFT, interpolation, general node
+ * * In-place transform p of length 16, seen as the evaluations at
+ *       w0, -w0, w1, -w1, ..., w7, -w7
+ * where w_k = F->tab_w[16*node + 2*k] for 0 <= k < 8, of a polynomial of
+ * degree < 16, into its coefficients; node may be an expression (e.g. 4*k+1)
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ */
+#define IDFT16_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7,       \
+                             p8, p9, p10, p11, p12, p13, p14, p15, \
+                             node, n, n2, tab_w)                   \
+do {                                                               \
+    ulong w2, w2pre, w, wpre, Iw, Iwpre;                           \
+    /* layer 1: each group of four has its own roots */            \
+    w2 = tab_w[8*(node)];                                          \
+    w2pre = tab_w[8*(node)+1];                                     \
+    w = tab_w[16*(node)];                                          \
+    wpre = tab_w[16*(node)+1];                                     \
+    Iw = tab_w[16*(node)+2];                                       \
+    Iwpre = tab_w[16*(node)+3];                                    \
+    IDFT4_NODE_LAZY_1_2(p0, p1, p2, p3,                            \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+                                                                   \
+    w2 = tab_w[8*(node)+2];                                        \
+    w2pre = tab_w[8*(node)+3];                                     \
+    w = tab_w[16*(node)+4];                                        \
+    wpre = tab_w[16*(node)+5];                                     \
+    Iw = tab_w[16*(node)+6];                                       \
+    Iwpre = tab_w[16*(node)+7];                                    \
+    IDFT4_NODE_LAZY_1_2(p4, p5, p6, p7,                            \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+                                                                   \
+    w2 = tab_w[8*(node)+4];                                        \
+    w2pre = tab_w[8*(node)+5];                                     \
+    w = tab_w[16*(node)+8];                                        \
+    wpre = tab_w[16*(node)+9];                                     \
+    Iw = tab_w[16*(node)+10];                                      \
+    Iwpre = tab_w[16*(node)+11];                                   \
+    IDFT4_NODE_LAZY_1_2(p8, p9, p10, p11,                          \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+                                                                   \
+    w2 = tab_w[8*(node)+6];                                        \
+    w2pre = tab_w[8*(node)+7];                                     \
+    w = tab_w[16*(node)+12];                                       \
+    wpre = tab_w[16*(node)+13];                                    \
+    Iw = tab_w[16*(node)+14];                                      \
+    Iwpre = tab_w[16*(node)+15];                                   \
+    IDFT4_NODE_LAZY_1_2(p12, p13, p14, p15,                        \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+    /* layer 2: same roots for all four 4-point transforms */      \
+    w2 = tab_w[2*(node)];                                          \
+    w2pre = tab_w[2*(node)+1];                                     \
+    w = tab_w[4*(node)];                                           \
+    wpre = tab_w[4*(node)+1];                                      \
+    Iw = tab_w[4*(node)+2];                                        \
+    Iwpre = tab_w[4*(node)+3];                                     \
+                                                                   \
+    IDFT4_NODE_LAZY_2_2(p0, p4, p8, p12,                           \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+    IDFT4_NODE_LAZY_2_2(p1, p5, p9, p13,                           \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+    IDFT4_NODE_LAZY_2_2(p2, p6, p10, p14,                          \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+    IDFT4_NODE_LAZY_2_2(p3, p7, p11, p15,                          \
+                        w2, w2pre, w, wpre, Iw, Iwpre,             \
+                        n, n2);                                    \
+} while(0)
+
+
+/*-------------------*/
+/* length 32, node 0 */
+/*-------------------*/
+
+/** 32-point FFT, evaluation (DFT32_LAZY_1_4 and DFT32_LAZY_2_4)
+ * * In-place transform p of length 32, seen as a polynomial
+ * p(x) = p0 + p1*x + ... + p31*x**31, into its evaluations
+ * at all 32nd roots of unity 1, -1, I, -I... (bit-reversed order)
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ * * lazy_2_4: in [0..2n) / out [0..4n) / max < 4n
+ */
+#define DFT32_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7,                           \
+                       p8, p9, p10, p11, p12, p13, p14, p15,                     \
+                       p16, p17, p18, p19, p20, p21, p22, p23,                   \
+                       p24, p25, p26, p27, p28, p29, p30, p31,                   \
+                       n, n2, tab_w)                                             \
+do {                                                                             \
+    DFT4_LAZY_1_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2);                  \
+    if (p0 >= n2)                                                                \
+        p0 -= n2;                                                                \
+    DFT4_LAZY_1_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2);                  \
+    if (p1 >= n2)                                                                \
+        p1 -= n2;                                                                \
+    DFT4_LAZY_1_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2);                 \
+    if (p2 >= n2)                                                                \
+        p2 -= n2;                                                                \
+    DFT4_LAZY_1_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2);                 \
+    if (p3 >= n2)                                                                \
+        p3 -= n2;                                                                \
+    DFT4_LAZY_1_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2);                 \
+    if (p4 >= n2)                                                                \
+        p4 -= n2;                                                                \
+    DFT4_LAZY_1_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2);                 \
+    if (p5 >= n2)                                                                \
+        p5 -= n2;                                                                \
+    DFT4_LAZY_1_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2);                 \
+    if (p6 >= n2)                                                                \
+        p6 -= n2;                                                                \
+    DFT4_LAZY_1_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2);                 \
+    if (p7 >= n2)                                                                \
+        p7 -= n2;                                                                \
+                                                                                 \
+    /* DFT8_LAZY_2_4 needs p0..p7 < 2n, hence the 8 reductions above */          \
+    DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w);                 \
+    DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w);   \
+    DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w); \
+    DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w); \
+} while(0)
+
+#define DFT32_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7,                           \
+                       p8, p9, p10, p11, p12, p13, p14, p15,                     \
+                       p16, p17, p18, p19, p20, p21, p22, p23,                   \
+                       p24, p25, p26, p27, p28, p29, p30, p31,                   \
+                       n, n2, tab_w)                                             \
+do { /* 32-point DFT, node 0, inputs in [0..2n) */                               \
+    DFT4_LAZY_2_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2);                  \
+    if (p0 >= n2)                                                                \
+        p0 -= n2;                                                                \
+    DFT4_LAZY_2_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2);                  \
+    if (p1 >= n2)                                                                \
+        p1 -= n2;                                                                \
+    DFT4_LAZY_2_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2);                 \
+    if (p2 >= n2)                                                                \
+        p2 -= n2;                                                                \
+    DFT4_LAZY_2_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2);                 \
+    if (p3 >= n2)                                                                \
+        p3 -= n2;                                                                \
+    DFT4_LAZY_2_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2);                 \
+    if (p4 >= n2)                                                                \
+        p4 -= n2;                                                                \
+    DFT4_LAZY_2_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2);                 \
+    if (p5 >= n2)                                                                \
+        p5 -= n2;                                                                \
+    DFT4_LAZY_2_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2);                 \
+    if (p6 >= n2)                                                                \
+        p6 -= n2;                                                                \
+    DFT4_LAZY_2_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2);                 \
+    if (p7 >= n2)                                                                \
+        p7 -= n2;                                                                \
+                                                                                 \
+    /* DFT8_LAZY_2_4 needs p0..p7 < 2n, hence the 8 reductions above */          \
+    DFT8_LAZY_2_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w);                 \
+    DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w);   \
+    DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w); \
+    DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w); \
+} while(0)
+
+/** 32-point inverse FFT (interpolation), node 0
+ * * Transforms p of length 32 in place: the input holds the values of a
+ * polynomial p(x) of degree < 32 at all 32nd roots of unity 1, -1, I, -I...
+ * (bit-reversed order); the output holds the coefficients of p(x)
+ * * lazy_1_4: in [0..n) / out [0..4n) / max < 4n
+ */
+#define IDFT32_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11,            \
+                        p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,  \
+                        p24, p25, p26, p27, p28, p29, p30, p31, n, n2, tab_w)        \
+do {                                                                                 \
+    /* layer 1: 8-point inverse transform on each quarter */                         \
+    IDFT8_LAZY_1_4(p0, p1, p2, p3, p4, p5, p6, p7, n, n2, tab_w);                    \
+    IDFT8_NODE_LAZY_1_2(p8, p9, p10, p11, p12, p13, p14, p15, 1, n, n2, tab_w);      \
+    IDFT8_NODE_LAZY_1_2(p16, p17, p18, p19, p20, p21, p22, p23, 2, n, n2, tab_w);    \
+    IDFT8_NODE_LAZY_1_2(p24, p25, p26, p27, p28, p29, p30, p31, 3, n, n2, tab_w);    \
+                                                                                     \
+    /* layer 2: 4-point inverse transform across quarters */                         \
+    IDFT4_LAZY_4222_4(p0, p8, p16, p24, tab_w[2], tab_w[3], n, n2);                  \
+    IDFT4_LAZY_4222_4(p1, p9, p17, p25, tab_w[2], tab_w[3], n, n2);                  \
+    IDFT4_LAZY_4222_4(p2, p10, p18, p26, tab_w[2], tab_w[3], n, n2);                 \
+    IDFT4_LAZY_4222_4(p3, p11, p19, p27, tab_w[2], tab_w[3], n, n2);                 \
+    IDFT4_LAZY_4222_4(p4, p12, p20, p28, tab_w[2], tab_w[3], n, n2);                 \
+    IDFT4_LAZY_4222_4(p5, p13, p21, p29, tab_w[2], tab_w[3], n, n2);                 \
+    IDFT4_LAZY_4222_4(p6, p14, p22, p30, tab_w[2], tab_w[3], n, n2);                 \
+    IDFT4_LAZY_4222_4(p7, p15, p23, p31, tab_w[2], tab_w[3], n, n2);                 \
+} while(0)
+
+/*-------------------------*/
+/* length 32, general node */
+/*-------------------------*/
+
+/** 32-point FFT, evaluation, general node
+ * * In-place transform p of length 32, seen as a polynomial
+ * p(x) = p0 + p1*x + ... + p31*x**31, into its evaluations
+ *       p(w0), p(-w0), p(w1), p(-w1), ..., p(w15), p(-w15)
+ * where w_k = F->tab_w[32*node + 2*k] for 0 <= k < 16; these 32 points are,
+ * by construction, the 32 roots of the polynomial x**32 - F->tab_w[node]
+ * * node is parenthesized on each use, so it may be an expression (e.g. 4*k+1)
+ * * lazy_4_4: in [0..4n) / out [0..4n) / max < 4n
+ */
+#define DFT32_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7,                                \
+                            p8, p9, p10, p11, p12, p13, p14, p15,                          \
+                            p16, p17, p18, p19, p20, p21, p22, p23,                        \
+                            p24, p25, p26, p27, p28, p29, p30, p31,                        \
+                            node, n, n2, tab_w)                                            \
+do {                                                                                       \
+    ulong w2 = tab_w[2*(node)];                                                            \
+    ulong w2pre = tab_w[2*(node)+1];                                                       \
+    ulong w = tab_w[4*(node)];                                                             \
+    ulong wpre = tab_w[4*(node)+1];                                                        \
+    ulong Iw = tab_w[4*(node)+2];                                                          \
+    ulong Iwpre = tab_w[4*(node)+3];                                                       \
+    DFT4_NODE_LAZY_4_4(p0, p8, p16, p24, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);            \
+    DFT4_NODE_LAZY_4_4(p1, p9, p17, p25, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);            \
+    DFT4_NODE_LAZY_4_4(p2, p10, p18, p26, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    DFT4_NODE_LAZY_4_4(p3, p11, p19, p27, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    DFT4_NODE_LAZY_4_4(p4, p12, p20, p28, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    DFT4_NODE_LAZY_4_4(p5, p13, p21, p29, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    DFT4_NODE_LAZY_4_4(p6, p14, p22, p30, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    DFT4_NODE_LAZY_4_4(p7, p15, p23, p31, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+                                                                                           \
+    DFT8_NODE_LAZY_4_4(p0, p1, p2, p3, p4, p5, p6, p7, 4*(node), n, n2, tab_w);            \
+    DFT8_NODE_LAZY_4_4(p8, p9, p10, p11, p12, p13, p14, p15, 4*(node)+1, n, n2, tab_w);    \
+    DFT8_NODE_LAZY_4_4(p16, p17, p18, p19, p20, p21, p22, p23, 4*(node)+2, n, n2, tab_w);  \
+    DFT8_NODE_LAZY_4_4(p24, p25, p26, p27, p28, p29, p30, p31, 4*(node)+3, n, n2, tab_w);  \
+} while(0)
+
+/** 32-point FFT, interpolation, general node
+ * * In-place transform p of length 32, seen as the evaluations at
+ *       w0, -w0, w1, -w1, ..., w15, -w15
+ * where w_k = F->tab_w[32*node + 2*k] for 0 <= k < 16, of a polynomial of
+ * degree < 32, into its coefficients; node may be an expression (e.g. 4*k+1)
+ * * lazy_1_2: in [0..n) / out [0..2n) / max < 4n
+ */
+#define IDFT32_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7,                                \
+                             p8, p9, p10, p11, p12, p13, p14, p15,                          \
+                             p16, p17, p18, p19, p20, p21, p22, p23,                        \
+                             p24, p25, p26, p27, p28, p29, p30, p31,                        \
+                             node, n, n2, tab_w)                                            \
+do {                                                                                        \
+    IDFT8_NODE_LAZY_1_2(p0, p1, p2, p3, p4, p5, p6, p7, 4*(node), n, n2, tab_w);            \
+    IDFT8_NODE_LAZY_1_2(p8, p9, p10, p11, p12, p13, p14, p15, 4*(node)+1, n, n2, tab_w);    \
+    IDFT8_NODE_LAZY_1_2(p16, p17, p18, p19, p20, p21, p22, p23, 4*(node)+2, n, n2, tab_w);  \
+    IDFT8_NODE_LAZY_1_2(p24, p25, p26, p27, p28, p29, p30, p31, 4*(node)+3, n, n2, tab_w);  \
+                                                                                            \
+    ulong w2 = tab_w[2*(node)];                                                             \
+    ulong w2pre = tab_w[2*(node)+1];                                                        \
+    ulong w = tab_w[4*(node)];                                                              \
+    ulong wpre = tab_w[4*(node)+1];                                                         \
+    ulong Iw = tab_w[4*(node)+2];                                                           \
+    ulong Iwpre = tab_w[4*(node)+3];                                                        \
+    IDFT4_NODE_LAZY_2_2(p0, p8, p16, p24, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);            \
+    IDFT4_NODE_LAZY_2_2(p1, p9, p17, p25, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);            \
+    IDFT4_NODE_LAZY_2_2(p2, p10, p18, p26, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    IDFT4_NODE_LAZY_2_2(p3, p11, p19, p27, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    IDFT4_NODE_LAZY_2_2(p4, p12, p20, p28, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    IDFT4_NODE_LAZY_2_2(p5, p13, p21, p29, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    IDFT4_NODE_LAZY_2_2(p6, p14, p22, p30, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+    IDFT4_NODE_LAZY_2_2(p7, p15, p23, p31, w2, w2pre, w, wpre, Iw, Iwpre, n, n2);           \
+} while(0)
+
+#endif  /* N_FFT_MACROS_H */
diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c
new file mode 100644
index 0000000000..b37a804c85
--- /dev/null
+++ b/src/n_fft/profile/p-dft.c
@@ -0,0 +1,197 @@
+#include "nmod_poly.h"
+#include "profiler.h"
+#include "nmod_vec.h"
+#include "fft_small.h"
+#include "n_fft.h"
+
+#define NUM_PRIMES 7
+
+typedef struct
+{
+   ulong prime;  /* modulus handed to the FFT context initialisers */
+   ulong depth;  /* transforms are run on arrays of length 2**depth */
+} info_t;
+
+#define SAMPLE(fun, _variant)                                                    \
+void sample_##fun##_variant(void * arg, ulong count)                             \
+{                                                                                \
+    info_t * info = (info_t *) arg;                                              \
+    const ulong p = info->prime;                                                 \
+    const ulong depth = info->depth;                                             \
+                                                                                 \
+    const ulong len = (UWORD(1) << depth);                                       \
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));                \
+                                                                                 \
+    /* n_fft context built from depth and p */                                   \
+    n_fft_ctx_t F;                                                               \
+    n_fft_ctx_init2(F, depth, p);                                                \
+                                                                                 \
+    FLINT_TEST_INIT(state);                                                      \
+                                                                                 \
+    ulong * coeffs = _nmod_vec_init(len);                                        \
+    for (ulong k = 0; k < len; k++)                                              \
+        coeffs[k] = n_randint(state, p);                                         \
+                                                                                 \
+    for (ulong i = 0; i < count; i++)                                            \
+    {                                                                            \
+        prof_start();  /* time rep successive calls on coeffs */                 \
+        for (ulong j = 0; j < rep; j++)                                          \
+            n_fft_##fun##_variant(coeffs, depth, F);                             \
+        prof_stop();                                                             \
+    }                                                                            \
+                                                                                 \
+    _nmod_vec_clear(coeffs);                                                     \
+    n_fft_ctx_clear(F);                                                          \
+    FLINT_TEST_CLEAR(state);                                                     \
+}                                                                                \
+
+SAMPLE(dft, )
+SAMPLE(idft, )
+SAMPLE(dft_t, )
+SAMPLE(idft_t, )
+
+void sample_sd_fft(void * arg, ulong count)
+{
+    /* profiling sample for the fft_small transform sd_fft_trunc, timed like SAMPLE */
+    info_t * info = (info_t *) arg;
+    const ulong p = info->prime;
+    const ulong depth = info->depth;
+    const ulong len = UWORD(1) << depth;
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+    sd_fft_ctx_t Q;
+    sd_fft_ctx_init_prime(Q, p);
+    sd_fft_ctx_fit_depth(Q, depth);
+    ulong sz = sd_fft_ctx_data_size(depth)*sizeof(double);
+    FLINT_TEST_INIT(state);
+    nmod_t mod;
+    nmod_init(&mod, p);
+    ulong * coeffs = _nmod_vec_init(len);
+    _nmod_vec_randtest(coeffs, state, len, mod);
+
+    /* random residues converted to doubles in a 4096-byte aligned buffer */
+    double* data = flint_aligned_alloc(4096, n_round_up(sz, 4096));
+    for (ulong i = 0; i < len; i++)
+        data[i] = coeffs[i];
+    _nmod_vec_clear(coeffs);  /* integer copy no longer needed once converted */
+
+    for (ulong i = 0; i < count; i++)
+    {
+        prof_start();
+        for (ulong j = 0; j < rep; j++)
+            sd_fft_trunc(Q, data, depth, len, len);
+        prof_stop();
+    }
+
+    flint_aligned_free(data);  /* pairs with flint_aligned_alloc above */
+    sd_fft_ctx_clear(Q);
+    FLINT_TEST_CLEAR(state);
+}
+
+int main(void)
+{
+    flint_printf("- depth is log(fft length)\n");
+    flint_printf("- timing DFT (length power of 2) for several bit lengths and depths\n");
+    flint_printf("depth\tsd_fft\tdft\tidft\tdft_t\tidft_t\n");
+
+    ulong primes[NUM_PRIMES] = {
+        786433,              // 20 bits, 1 + 2**18 * 3
+        1073479681,          // 30 bits, 1 + 2**30 - 2**18 == 1 + 2**18 * (2**12 - 1)
+        2013265921,          // 31 bits, 1 + 2**27 * 3 * 5
+        2748779069441,       // 42 bits, 1 + 2**39 * 5
+        1108307720798209,    // 50 bits, 1 + 2**44 * 3**2 * 7
+        1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23
+        4611686018427322369  // 62 bits: 1 + 2**62 - 2**16 == 1 + 2**16 * (2**46 - 1)
+    };
+    ulong max_depths[NUM_PRIMES] = { 18, 18, 25, 25, 25, 25, 16 };
+
+    for (ulong k = 4; k < 6; k++)
+    {
+        for (ulong depth = 3; depth <= max_depths[k]; depth++)
+        {
+            flint_printf("%wu\t", depth);
+
+            info_t info;
+            info.prime = primes[k];
+            info.depth = depth;
+
+            const ulong len = UWORD(1) << depth;
+            const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+            // zero-initialised so that a skipped measurement prints 0, not garbage
+            double min[5] = { 0.0 };
+            double max;
+            // sd_fft is only timed for k < 5 (presumably a prime size limit -- confirm)
+            if (k < 5) prof_repeat(min+0, &max, sample_sd_fft, (void *) &info);
+            prof_repeat(min+1, &max, sample_dft, (void *) &info);
+            prof_repeat(min+2, &max, sample_idft, (void *) &info);
+            prof_repeat(min+3, &max, sample_dft_t, (void *) &info);
+            prof_repeat(min+4, &max, sample_idft_t, (void *) &info);
+
+            flint_printf("%.1e\t%.1e\t%.1e\t%.1e\t%.1e\n",
+                    min[0]/(double)1000000/rep,
+                    min[1]/(double)1000000/rep,
+                    min[2]/(double)1000000/rep,
+                    min[3]/(double)1000000/rep,
+                    min[4]/(double)1000000/rep
+                    );
+        }
+    }
+    return 0;
+}
+
+/** 50 bit prime, commit "introduce_nmod_fft f1852d1c5"
+ *
+ * Output on zen4 (AMD Ryzen 7 PRO 7840U)
+ *
+ * depth   sd_fft  dft     idft    dft_t   idft_t
+ * 3       1.5e-08 2.2e-08 2.0e-08 2.3e-08 1.8e-08
+ * 4       2.1e-08 4.4e-08 4.5e-08 4.3e-08 4.7e-08
+ * 5       2.7e-08 9.3e-08 1.1e-07 9.5e-08 1.1e-07
+ * 6       6.2e-08 2.2e-07 2.3e-07 2.0e-07 2.6e-07
+ * 7       1.2e-07 5.0e-07 5.9e-07 5.1e-07 5.6e-07
+ * 8       2.9e-07 1.2e-06 1.2e-06 1.1e-06 1.3e-06
+ * 9       5.7e-07 2.6e-06 2.8e-06 2.7e-06 2.8e-06
+ * 10      1.3e-06 5.7e-06 5.6e-06 5.2e-06 6.1e-06
+ * 11      2.9e-06 1.2e-05 1.3e-05 1.2e-05 1.3e-05
+ * 12      6.0e-06 2.7e-05 2.6e-05 2.5e-05 2.8e-05
+ * 13      1.3e-05 5.6e-05 6.0e-05 5.7e-05 6.0e-05
+ * 14      2.9e-05 1.2e-04 1.2e-04 1.1e-04 1.3e-04
+ * 15      5.9e-05 2.6e-04 2.7e-04 2.6e-04 2.7e-04
+ * 16      1.2e-04 5.6e-04 5.6e-04 5.1e-04 5.8e-04
+ * 17      2.7e-04 1.2e-03 1.2e-03 1.2e-03 1.2e-03
+ * 18      5.8e-04 2.5e-03 2.4e-03 2.3e-03 2.6e-03
+ * 19      1.2e-03 5.2e-03 5.4e-03 5.1e-03 5.4e-03
+ * 20      2.6e-03 1.1e-02 1.1e-02 1.0e-02 1.2e-02
+ * 21      6.0e-03 2.3e-02 2.3e-02 2.3e-02 2.4e-02
+ * 22      1.3e-02 5.0e-02 4.9e-02 4.6e-02 5.1e-02
+ * 23      2.8e-02 1.0e-01 1.1e-01 1.0e-01 1.1e-01
+ * 24      6.2e-02 2.2e-01 2.3e-01 2.0e-01 2.3e-01
+ * 25      1.3e-01 4.5e-01 4.5e-01 4.4e-01 4.7e-01
+ *
+ * Output on meteorlake (Intel(R) Core(TM) Ultra 7 165H)
+ *
+ * depth   sd_fft  dft     idft    dft_t   idft_t
+ * 3       1.9e-08 2.1e-08 1.6e-08 2.4e-08 1.3e-08
+ * 4       2.2e-08 4.6e-08 3.6e-08 4.5e-08 3.7e-08
+ * 5       3.0e-08 9.5e-08 9.8e-08 1.0e-07 9.0e-08
+ * 6       6.4e-08 2.3e-07 2.0e-07 2.0e-07 2.4e-07
+ * 7       1.3e-07 5.3e-07 5.0e-07 5.2e-07 5.3e-07
+ * 8       2.8e-07 1.2e-06 9.5e-07 9.8e-07 1.2e-06
+ * 9       6.4e-07 2.6e-06 2.3e-06 2.4e-06 2.6e-06
+ * 10      1.4e-06 5.7e-06 4.5e-06 4.6e-06 5.6e-06
+ * 11      3.0e-06 1.3e-05 1.1e-05 1.1e-05 1.3e-05
+ * 12      6.4e-06 2.7e-05 2.0e-05 2.1e-05 2.7e-05
+ * 13      1.4e-05 5.8e-05 4.8e-05 4.9e-05 5.8e-05
+ * 14      3.0e-05 1.2e-04 9.2e-05 9.6e-05 1.2e-04
+ * 15      6.3e-05 2.6e-04 2.1e-04 2.2e-04 2.5e-04
+ * 16      1.3e-04 5.4e-04 4.1e-04 4.2e-04 5.5e-04
+ * 17      2.8e-04 1.1e-03 9.4e-04 9.6e-04 1.1e-03
+ * 18      6.3e-04 2.4e-03 1.9e-03 2.0e-03 2.5e-03
+ * 19      1.3e-03 5.2e-03 4.3e-03 4.4e-03 5.1e-03
+ * 20      2.9e-03 1.1e-02 8.7e-03 8.9e-03 1.1e-02
+ * 21      6.4e-03 2.4e-02 2.1e-02 2.0e-02 2.4e-02
+ * 22      1.5e-02 5.3e-02 4.0e-02 4.1e-02 5.2e-02
+ * 23      3.0e-02 1.1e-01 9.2e-02 9.1e-02 1.1e-01
+ * 24      6.3e-02 2.3e-01 1.9e-01 1.8e-01 2.3e-01
+ * 25      1.4e-01 4.7e-01 4.1e-01 4.1e-01 4.7e-01
+ */
diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c
new file mode 100644
index 0000000000..f19117066a
--- /dev/null
+++ b/src/n_fft/profile/p-init.c
@@ -0,0 +1,126 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "nmod.h"
+#include "profiler.h"
+#include "n_fft.h"
+
+#define num_primes 5
+
+typedef struct
+{
+   ulong prime;     /* modulus p */
+   ulong depth;     /* depth passed to the context initialiser */
+   ulong maxdepth;  /* exponent defining the cofactor (p - 1) >> maxdepth */
+} info_t;
+
+void sample_init2_root(void * arg, ulong count)
+{
+    // profiling sample: each timed region runs rep context init + clear pairs
+    info_t * info = (info_t *) arg;
+    ulong p = info->prime;
+    ulong depth = info->depth;
+    ulong maxdepth = info->maxdepth;
+    const ulong len = UWORD(1) << depth;
+    const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+    // w == g**(cofactor * 2**(maxdepth - depth)) for a primitive root g mod p
+    nmod_t mod;
+    nmod_init(&mod, p);
+    ulong cofactor = (p - 1) >> maxdepth;
+    ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), cofactor, mod);
+    ulong w = nmod_pow_ui(w0, UWORD(1) << (maxdepth - depth), mod);
+
+    FLINT_TEST_INIT(state);
+
+    for (ulong i = 0; i < count; i++)
+    {
+        prof_start();
+        for (ulong j = 0; j < rep; j++)
+        {
+            n_fft_ctx_t F;
+            n_fft_ctx_init2_root(F, w, depth, cofactor, depth, p);
+            n_fft_ctx_clear(F);
+        }
+        prof_stop();
+    }
+
+    FLINT_TEST_CLEAR(state);
+}
+
+/*-----------------------------------------------------------------*/
+/* initialize context for FFT for several bit lengths and depths   */
+/*-----------------------------------------------------------------*/
+void time_fft_init(ulong * primes, ulong * max_depths)
+{
+    // one output row per depth, one "t_raw|t_unit" column per prime
+    for (ulong depth = 3; depth <= 25; depth++)
+    {
+        flint_printf("%wu\t", depth);
+        for (ulong k = 0; k < num_primes; k++)
+        {
+            if (depth <= max_depths[k])
+            {
+                info_t info;
+                info.prime = primes[k];
+                info.maxdepth = max_depths[k];
+                info.depth = depth;
+
+                const ulong len = UWORD(1) << depth;
+                const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len));
+
+                double min;
+                double max;
+
+                prof_repeat(&min, &max, sample_init2_root, (void *) &info);
+
+                flint_printf("%.1e|%.1e\t",
+                        min/(double)1000000/rep,
+                        min/(double)FLINT_CLOCK_SCALE_FACTOR/len/rep
+                        );
+            }
+            else
+                flint_printf("  na   |  na   \t");
+        }
+        flint_printf("\n");
+    }
+}
+
+/*------------------------------------------------------------*/
+/* main prints the table header and calls time_fft_init()     */
+/*------------------------------------------------------------*/
+int main()
+{
+    printf("- depth == precomputing w**k, 0 <= k < 2**depth\n");
+    printf("- timing init FFT context + clear at this depth:\n");
+    printf("      t_raw == raw time\n");
+    printf("      t_unit == raw time divided by 2**depth * clock scale factor\n");
+    printf("\n");
+
+    printf("     \t    20 bits    \t    31 bits    \t    42 bits    \t    50 bits    \t    60 bits    \n");
+    printf("depth\tt_raw  | t_unit\tt_raw  | t_unit\tt_raw  | t_unit\tt_raw  | t_unit\tt_raw  | t_unit\n");
+
+    // TODO fix for FLINT_BITS==32 (the 42, 50 and 60 bit primes need a 64-bit ulong)
+    ulong primes[num_primes] = {
+        786433,              // 20 bits, 1 + 2**18 * 3
+        2013265921,          // 31 bits, 1 + 2**27 * 3 * 5
+        2748779069441,       // 42 bits, 1 + 2**39 * 5
+        1108307720798209,    // 50 bits, 1 + 2**44 * 3**2 * 7
+        1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23
+    };
+    ulong max_depths[num_primes] = { 18, 25, 25, 25, 25 };  // largest depth timed for each prime
+
+    time_fft_init(primes, max_depths);
+
+    return 0;
+}
+
diff --git a/src/n_fft/test/main.c b/src/n_fft/test/main.c
new file mode 100644
index 0000000000..5c82383b68
--- /dev/null
+++ b/src/n_fft/test/main.c
@@ -0,0 +1,33 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+/* Include functions *********************************************************/
+
+#include "t-init.c"
+#include "t-dft.c"
+#include "t-idft.c"
+#include "t-dft_t.c"
+#include "t-idft_t.c"
+
+/* Array of test functions ***************************************************/
+
+test_struct tests[] =
+{
+    TEST_FUNCTION(n_fft_ctx_init2),  /* t-init.c */
+    TEST_FUNCTION(n_fft_dft),        /* t-dft.c */
+    TEST_FUNCTION(n_fft_idft),       /* t-idft.c */
+    TEST_FUNCTION(n_fft_dft_t),      /* t-dft_t.c */
+    TEST_FUNCTION(n_fft_idft_t),     /* t-idft_t.c */
+};
+
+/* main function *************************************************************/
+
+TEST_MAIN(tests)
diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c
new file mode 100644
index 0000000000..e6808a5e80
--- /dev/null
+++ b/src/n_fft/test/t-dft.c
@@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 11
+
+TEST_FUNCTION_START(n_fft_dft, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p such that 2**MAX_EVAL_DEPTH divides p - 1
+        ulong max_depth, prime;
+
+        // iterations with i > 100 use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i > 100)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // evaluation points: pairs (w, -w) built from the even entries of tab_w
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k];  // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // choose random poly of degree < len
+            nmod_poly_t pol;
+            nmod_poly_init(pol, mod.n);
+            nmod_poly_randtest(pol, state, len);
+            // copy it for DFT
+            nn_ptr p = _nmod_vec_init(len);
+            _nmod_vec_set(p, pol->coeffs, len);
+
+            // reference: general multipoint evaluation at the first len roots
+            nn_ptr evals_br = _nmod_vec_init(len);
+            nmod_poly_evaluate_nmod_vec(evals_br, pol, roots, len);
+
+            // evals by DFT
+            n_fft_dft(p, depth, F);
+
+            int res = _nmod_vec_equal(evals_br, p, len);
+
+            if (!res)
+            {
+                TEST_FUNCTION_FAIL(
+                    "prime = %wu\n"
+                    "root of unity = %wu\n"
+                    "max_depth = %wu\n"
+                    "depth = %wu\n"
+                    "failed equality test\n",
+                    prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            nmod_poly_clear(pol);
+            _nmod_vec_clear(evals_br);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-dft_t.c b/src/n_fft/test/t-dft_t.c
new file mode 100644
index 0000000000..aa0e1d676e
--- /dev/null
+++ b/src/n_fft/test/t-dft_t.c
@@ -0,0 +1,130 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 9
+
+/** reference implementation of the transposed DFT, quadratic in len:
+ *      q[j] == sum(p[i] * roots[i]**j for 0 <= i < len),  0 <= j < len
+ * i.e. q receives the weighted power sums of the points in roots, with
+ * weights p; q is expected not to overlap p (q[j] is zeroed before use)
+ */
+static void t_dft_t_weighted_power_sums(nn_ptr q, nn_srcptr p, nn_ptr roots, ulong len, nmod_t mod)
+{
+    ulong i, j;
+
+    // powers[i] == roots[i]**j at the start of round j; all ones for j == 0
+    nn_ptr powers = _nmod_vec_init(len);
+    for (i = 0; i < len; i++)
+        powers[i] = 1;
+
+    for (j = 0; j < len; j++)
+    {
+        q[j] = 0;
+        for (i = 0; i < len; i++)
+        {
+            const ulong term = nmod_mul(p[i], powers[i], mod);
+            q[j] = nmod_add(q[j], term, mod);
+            powers[i] = nmod_mul(powers[i], roots[i], mod);
+        }
+    }
+    _nmod_vec_clear(powers);
+}
+
+TEST_FUNCTION_START(n_fft_dft_t, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p such that 2**MAX_EVAL_DEPTH divides p - 1
+        ulong max_depth, prime;
+
+        // iterations with i > 100 use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i > 100)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // points: pairs (w, -w) built from the even entries of tab_w
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k];  // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // construct random array of length len
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+
+            // reference result: naive weighted power sums, computed before p
+            // is transformed in place; every entry of q is written by the
+            // helper, so q needs no initialisation
+            nn_ptr q = _nmod_vec_init(len);
+            t_dft_t_weighted_power_sums(q, p, roots, len, mod);
+
+            // transposed DFT
+            n_fft_dft_t(p, depth, F);
+
+            int res = _nmod_vec_equal(p, q, len);
+
+            if (!res)
+                TEST_FUNCTION_FAIL(
+                        "prime = %wu\n"
+                        "root of unity = %wu\n"
+                        "max_depth = %wu\n"
+                        "depth = %wu\n"
+                        "failed equality test\n",
+                        prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+
+            _nmod_vec_clear(p);
+            _nmod_vec_clear(q);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-idft.c b/src/n_fft/test/t-idft.c
new file mode 100644
index 0000000000..b1085e7590
--- /dev/null
+++ b/src/n_fft/test/t-idft.c
@@ -0,0 +1,107 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_poly.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 10
+
+TEST_FUNCTION_START(n_fft_idft, state)
+{
+    int i;
+
+    for (i = 0; i < 200 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p such that 2**MAX_EVAL_DEPTH divides p - 1
+        ulong max_depth, prime;
+
+        // iterations with i > 100 use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i > 100)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        // interpolation points: pairs (w, -w) built from the even entries of tab_w
+        nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong));
+        for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++)
+        {
+            roots[2*k] = F->tab_w[2*k];
+            roots[2*k+1] = prime - F->tab_w[2*k];  // < prime since F->tab_w[2*k] != 0
+        }
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // choose len random evaluations (allocated to match _nmod_vec_clear below)
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+
+            // reference: general interpolation at the first len roots
+            nmod_poly_t pol;
+            nmod_poly_init(pol, prime);
+            nmod_poly_interpolate_nmod_vec(pol, roots, p, len);
+
+            // interpolate via IDFT
+            n_fft_idft(p, depth, F);
+
+            int res = _nmod_vec_equal(pol->coeffs, p, len);
+
+            if (!res)
+            {
+                _nmod_vec_print(p, len, mod);
+                _nmod_vec_print(pol->coeffs, len, mod);
+                TEST_FUNCTION_FAIL(
+                    "prime = %wu\n"
+                    "root of unity = %wu\n"
+                    "max_depth = %wu\n"
+                    "depth = %wu\n"
+                    "failed equality test\n",
+                    prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            nmod_poly_clear(pol);
+        }
+
+        flint_free(roots);
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-idft_t.c b/src/n_fft/test/t-idft_t.c
new file mode 100644
index 0000000000..b4a0cb1bf2
--- /dev/null
+++ b/src/n_fft/test/t-idft_t.c
@@ -0,0 +1,96 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "flint.h"
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "nmod.h"
+#include "nmod_vec.h"
+#include "n_fft.h"
+
+#define MAX_EVAL_DEPTH 13
+
+TEST_FUNCTION_START(n_fft_idft_t, state)
+{
+    int i;
+
+    for (i = 0; i < 1000 * flint_test_multiplier(); i++)
+    {
+        // take some FFT prime p such that 2**MAX_EVAL_DEPTH divides p - 1
+        ulong max_depth, prime;
+
+        // all iterations with i > 100 use a fixed large prime, close to limit
+        // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1
+        // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1
+        if (i > 100)
+#if FLINT_BITS == 64
+            prime = UWORD(4611686018427322369);
+#else // FLINT_BITS == 32
+            prime = UWORD(1073479681);
+#endif
+        else
+        {
+            max_depth = MAX_EVAL_DEPTH + n_randint(state, 6);
+            prime = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(prime))
+                prime += (UWORD(1) << max_depth);
+        }
+        max_depth = flint_ctz(prime-1);
+
+        nmod_t mod;
+        nmod_init(&mod, prime);
+
+        // init FFT root tables
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime);
+
+        for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++)
+        {
+            const ulong len = (UWORD(1) << depth);
+
+            // construct random array of length len
+            nn_ptr p = _nmod_vec_init(len);
+            for (ulong k = 0; k < len; k++)
+                p[k] = n_randint(state, prime);
+            // keep a copy q of the input, since both transforms act on p in place
+            nn_ptr q = _nmod_vec_init(len);
+            _nmod_vec_set(q, p, len);
+
+            // apply idft_t
+            n_fft_idft_t(p, depth, F);
+            // apply dft_t
+            n_fft_dft_t(p, depth, F);
+
+            // check dft_t o idft_t == identity: p must be back to its initial value q
+            int res = _nmod_vec_equal(p, q, len);
+
+            if (!res)
+            {
+                TEST_FUNCTION_FAIL(
+                    "prime = %wu\n"
+                    "root of unity = %wu\n"
+                    "max_depth = %wu\n"
+                    "depth = %wu\n"
+                    "failed equality test\n",
+                    prime, F->tab_w2[2*(max_depth-2)], max_depth, depth);
+            }
+
+            _nmod_vec_clear(p);
+            _nmod_vec_clear(q);
+        }
+
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
+
+#undef MAX_EVAL_DEPTH
diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c
new file mode 100644
index 0000000000..30449469c6
--- /dev/null
+++ b/src/n_fft/test/t-init.c
@@ -0,0 +1,163 @@
+/*
+    Copyright (C) 2024 Vincent Neiger
+
+    This file is part of FLINT.
+
+    FLINT is free software: you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "ulong_extras.h"
+#include "n_fft.h"
+
+// return the bit reversal of the nbits low bits of k:
+// e.g. br_index([0,1,2,3], 4) == [0, 8, 4, 12]
+static inline ulong br_index(ulong k, ulong nbits)
+{
+    // reverse bit by bit; unlike the classical mask-and-swap scheme with
+    // 32-bit masks, this is valid for any word size and any k, and it
+    // returns 0 for nbits == 0 instead of shifting by the full word width
+    ulong rev = 0;
+    for (ulong i = 0; i < nbits; i++)
+    {
+        rev = (rev << 1) | (k & 1);
+        k >>= 1;
+    }
+    return rev;
+}
+
+int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t state)
+{
+    // if depth < 3, init is supposed to behave as if depth == 3
+    depth = FLINT_MAX(3, depth);
+
+    // check all basic attributes (the nonzero return value tells which check failed)
+    if (F->mod != p)
+        return 1;
+
+    if (F->max_depth != max_depth)
+        return 2;
+
+    if ((1 + (F->cofactor << max_depth)) != p)
+        return 3;
+
+    if (F->depth != depth)
+        return 4;
+
+    // retrieve the root stored last in tab_w2 and its inverse
+    const ulong w = F->tab_w2[2*(max_depth-2)];
+    const ulong iw = n_invmod(w, p);
+
+    // check it is a primitive 2**max_depth-th root: w**(2**(max_depth-1)) == -1
+    if (n_powmod2(w, UWORD(1)<<max_depth, p) != UWORD(1)
+            || n_powmod2(w, UWORD(1)<<(max_depth-1), p) != p-UWORD(1))
+        return 5;
+
+    // check all entries of tab_w2: value at 2*k, its Shoup precomputation at 2*k+1
+    for (ulong k = 0; k < max_depth-1; k++)
+    {
+        ulong w2 = F->tab_w2[2*k];
+        if (w2 != n_powmod2(w, UWORD(1)<<(max_depth-2-k), p))
+            return 6;
+        if (F->tab_w2[2*k+1] != n_mulmod_precomp_shoup(w2, p))
+            return 7;
+    }
+
+    // check all entries of tab_inv2: inverse of 2**(k+1) at 2*k, precomputation at 2*k+1
+    for (ulong k = 0; k < max_depth; k++)
+    {
+        ulong inv2 = F->tab_inv2[2*k];
+        if (inv2 != n_invmod((UWORD(1)<<(k+1)), p))
+            return 8;
+        if (F->tab_inv2[2*k+1] != n_mulmod_precomp_shoup(inv2, p))
+            return 9;
+    }
+
+    // check 1000 random entries of tab_w and tab_iw (powers in bit-reversed order)
+    for (ulong j = 0; j < 1000; j++)
+    {
+        ulong k = n_randint(state, UWORD(1) << (F->depth - 1));
+        ulong exp = br_index(k, F->max_depth - 1);
+
+        ulong wk = F->tab_w[2*k];
+        if (wk != n_powmod2(w, exp, p))
+            return 10;
+        if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p))
+            return 11;
+
+        ulong iwk = F->tab_iw[2*k];
+        if (iwk != n_powmod2(iw, exp, p))
+            return 12;
+        if (F->tab_iw[2*k+1] != n_mulmod_precomp_shoup(iwk, p))
+            return 13;
+    }
+
+    return 0;
+}
+
+TEST_FUNCTION_START(n_fft_ctx_init2, state)
+{
+    int i;
+
+    for (i = 0; i < 1000 * flint_test_multiplier(); i++)
+    {
+        ulong p, max_depth;
+        if (i % 20 != 0)
+        {
+            // take random prime in [17, 2**(FLINT_BITS-2))
+#if FLINT_BITS == 64
+            ulong bits = 5 + n_randint(state, 58);
+#else
+            ulong bits = 5 + n_randint(state, 25);
+#endif
+            p = n_randprime(state, bits, 1);
+            max_depth = flint_ctz(p-1);
+
+            // we need p such that 8 divides p-1
+            while (max_depth < 3)
+            {
+                p = n_randprime(state, bits, 1);
+                max_depth = flint_ctz(p-1);
+            }
+        }
+        else
+        {
+            // the above will most often have max_depth 3 or 4
+            // every now and then we want p with larger max_depth
+#if FLINT_BITS == 64
+            max_depth = 40 + n_randint(state, 10);
+#else
+            max_depth = 10 + n_randint(state, 10);
+#endif
+            p = 1 + (UWORD(1) << max_depth);
+            while (! n_is_prime(p))
+                p += (UWORD(1) << max_depth);
+            max_depth = flint_ctz(p-1);
+        }
+
+        // take depth in [0, min(12, max_depth)), upper bound excluded
+        ulong depth = n_randint(state, FLINT_MIN(12, max_depth));
+
+        // init
+        n_fft_ctx_t F;
+        n_fft_ctx_init2(F, depth, p);
+
+        int res = test_one(F, max_depth, depth, p, state);
+
+        if (res)
+            TEST_FUNCTION_FAIL(
+                    "prime = %wu\n"
+                    "root of unity = %wu\n"
+                    "max_depth = %wu\n"
+                    "depth = %wu\n"
+                    "error code = %d\n",
+                    p, F->tab_w2[2*(max_depth-2)], max_depth, depth, res);
+
+        n_fft_ctx_clear(F);
+    }
+
+    TEST_FUNCTION_END(state);
+}
diff --git a/src/nmod_vec/profile/p-dot.c b/src/nmod_vec/profile/p-dot.c
index 6d226710be..217f715704 100644
--- a/src/nmod_vec/profile/p-dot.c
+++ b/src/nmod_vec/profile/p-dot.c
@@ -9,9 +9,9 @@
     (at your option) any later version.  See <https://www.gnu.org/licenses/>.
 */
 
-#include <ulong_extras.h>
 #include <stdlib.h>  // for atoi
 
+#include "ulong_extras.h"
 #include "profiler.h"
 #include "nmod.h"
 #include "nmod_vec.h"
diff --git a/src/ulong_extras/profile/p-powmod.c b/src/ulong_extras/profile/p-powmod.c
new file mode 100644
index 0000000000..0a8e00c10e
--- /dev/null
+++ b/src/ulong_extras/profile/p-powmod.c
@@ -0,0 +1,152 @@
+/*
+   Copyright 2024 (C) Vincent Neiger
+
+   This file is part of FLINT.
+
+   FLINT is free software: you can redistribute it and/or modify it under
+   the terms of the GNU Lesser General Public License (LGPL) as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.  See <https://www.gnu.org/licenses/>.
+   */
+
+#include "profiler.h"
+#include "ulong_extras.h"
+#include "double_extras.h"
+
+#define NB_ITER 1000  /* length of the operand array: number of powmod calls timed per outer iteration */
+
+typedef struct  /* parameters handed to the sample_* functions through their void * argument */
+{
+    ulong bits;  /* bit count passed to n_randbits when drawing the modulus n */
+    ulong exp;   /* exponent passed to the powmod function being timed */
+} info_t;
+
+
+/* Time NB_ITER calls to n_powmod_ui_preinv per iteration; arg points to an info_t (bits, exp). */
+void sample_preinv(void * arg, ulong count)
+{
+    info_t * info = (info_t *) arg;
+    ulong exp = info->exp;
+    ulong bits = info->bits;
+    nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong));
+    FLINT_TEST_INIT(state);
+
+    for (ulong i = 0; i < count; i++)
+    {
+        ulong n = n_randbits(state, bits);  // 2**(bits-1) <= n < 2**bits
+        ulong ninv = n_preinvert_limb(n);
+        ulong norm = flint_clz(n);
+        ulong n_norm = n << norm;  // n_powmod_ui_preinv expects a normalised modulus (top bit set)
+        for (ulong j = 0; j < NB_ITER; j++)
+            array[j] = n_randint(state, n) << norm;  // reduced mod n, then shifted like n_norm
+
+        prof_start();  // results keep the 2**norm factor; only the timing matters here
+        for (ulong j = 0; j < NB_ITER; j++)
+            array[j] = n_powmod_ui_preinv(array[j], exp, n_norm, ninv, norm);
+        prof_stop();
+    }
+    flint_free(array);
+    FLINT_TEST_CLEAR(state);
+}
+
+/* Timing sample for n_powmod2_ui_preinv; arg must point to an info_t (bits, exp). */
+void sample_preinv2(void * arg, ulong count)
+{
+    const info_t * params = (const info_t *) arg;
+    const ulong nbits = params->bits;
+    const ulong e = params->exp;
+    nn_ptr vals = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong));
+    FLINT_TEST_INIT(state);
+
+    for (ulong rep = 0; rep < count; rep++)
+    {
+        const ulong n = n_randbits(state, nbits);
+        const ulong ninv = n_preinvert_limb(n);
+        // operands are drawn with n_randlimb, i.e. they are not reduced modulo n
+        for (ulong k = 0; k < NB_ITER; k++)
+            vals[k] = n_randlimb(state);
+
+        prof_start();  // only the exponentiation loop below is timed
+        for (ulong k = 0; k < NB_ITER; k++)
+            vals[k] = n_powmod2_ui_preinv(vals[k], e, n, ninv);
+        prof_stop();
+    }
+    flint_free(vals);
+    FLINT_TEST_CLEAR(state);
+}
+
+/* Timing sample for n_powmod_ui_precomp; arg must point to an info_t (bits, exp). */
+void sample_precomp(void * arg, ulong count)
+{
+    const info_t * params = (const info_t *) arg;
+    const ulong nbits = params->bits;
+    const ulong e = params->exp;
+    nn_ptr vals = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong));
+    FLINT_TEST_INIT(state);
+
+    for (ulong rep = 0; rep < count; rep++)
+    {
+        const ulong n = n_randbits(state, nbits);
+        const double npre = n_precompute_inverse(n);  // double-precision inverse used by the callee
+        // operands are drawn with n_randint(state, n)
+        for (ulong k = 0; k < NB_ITER; k++)
+            vals[k] = n_randint(state, n);
+
+        prof_start();  // only the exponentiation loop below is timed
+        for (ulong k = 0; k < NB_ITER; k++)
+            vals[k] = n_powmod_ui_precomp(vals[k], e, n, npre);
+        prof_stop();
+    }
+    flint_free(vals);
+    FLINT_TEST_CLEAR(state);
+}
+
+/* Print a table (rows: modulus bits, columns: exponent) of timings divided by log_2(exp). */
+int main(void)
+{
+    double min, max;
+    info_t info;
+    const ulong bits_list[] = {20, 30, 50, 60, 64};
+    const ulong exp_list[] = {5, 10, 20, 40, 80, 160, 1000, 10000, 100000, 1000000L, 10000000L};
+    const ulong bits_nb = sizeof(bits_list) / sizeof(bits_list[0]);
+    const ulong exp_nb = sizeof(exp_list) / sizeof(exp_list[0]);
+
+    flint_printf("compute an exponentiation a**e mod n, with nbits(n) = b\n");
+    // each %wu needs exactly one ulong argument; NB_ITER is an int constant, hence the casts
+    flint_printf("  computation is repeated on the element of a %wu-length array\n", (ulong) NB_ITER);
+    flint_printf("  time is divided by %wu * FLINT_CLOCK_SCALE_FACTOR * log_2(exp)\n", (ulong) NB_ITER);
+    flint_printf("timings are: powmod_ui_precomp | powmod_ui_preinv | powmod2_ui_preinv\n");
+    flint_printf("b \\ e\t");
+    for (ulong e = 0; e < exp_nb; e++)
+        flint_printf("%wu\t\t", exp_list[e]);
+    flint_printf("\n");
+
+    for (ulong b = 0; b < bits_nb; b++)
+    {
+        info.bits = bits_list[b];
+        flint_printf("%wu\t", info.bits);
+
+        for (ulong e = 0; e < exp_nb; e++)
+        {
+            info.exp = exp_list[e];
+            double log_exp = d_log2((double)info.exp);
+            // the precomp (double inverse) variant is only run for moduli of at most 53 bits
+            if (info.bits <= 53)
+            {
+                prof_repeat(&min, &max, sample_precomp, (void *) &info);
+                flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp));
+            }
+            else
+                flint_printf(" na |");
+
+            prof_repeat(&min, &max, sample_preinv, (void *) &info);
+            flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp));
+
+            prof_repeat(&min, &max, sample_preinv2, (void *) &info);
+            flint_printf("%4.1f\t", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp));
+        }
+        flint_printf("\n");
+    }
+
+    return 0;
+}