From 0bf01272967539d2ed325517ac78bb4349ea62b9 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 16 Sep 2024 11:24:39 +0200 Subject: [PATCH 01/71] add profile for powmod --- src/ulong_extras/profile/p-powmod.c | 152 ++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 src/ulong_extras/profile/p-powmod.c diff --git a/src/ulong_extras/profile/p-powmod.c b/src/ulong_extras/profile/p-powmod.c new file mode 100644 index 0000000000..0a8e00c10e --- /dev/null +++ b/src/ulong_extras/profile/p-powmod.c @@ -0,0 +1,152 @@ +/* + Copyright 2024 (C) Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . + */ + +#include "profiler.h" +#include "ulong_extras.h" +#include "double_extras.h" + +#define NB_ITER 1000 + +typedef struct +{ + ulong bits; + ulong exp; +} info_t; + + +void sample_preinv(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**(FLINT_BITS) + ulong ninv = n_preinvert_limb(n); + ulong norm = flint_clz(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randint(state, n); // 0 <= array[j] < n + + prof_start(); + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_powmod_ui_preinv(array[j], exp, n, ninv, norm); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +void sample_preinv2(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; 
i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**(FLINT_BITS) + ulong ninv = n_preinvert_limb(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randlimb(state); + + prof_start(); + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_powmod2_ui_preinv(array[j], exp, n, ninv); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +void sample_precomp(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong exp = info->exp; + ulong bits = info->bits; + nn_ptr array = (nn_ptr) flint_malloc(NB_ITER*sizeof(ulong)); + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + ulong n = n_randbits(state, bits); // 0 < n < 2**bits + double ninv = n_precompute_inverse(n); + + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_randint(state, n); // 0 <= array[j] < n + + prof_start(); + for (ulong j = 0; j < NB_ITER; j++) + array[j] = n_powmod_ui_precomp(array[j], exp, n, ninv); + prof_stop(); + } + + flint_free(array); + FLINT_TEST_CLEAR(state); +} + +int main(void) +{ + double min, max; + + const ulong bits_nb = 5; + ulong bits_list[] = {20, 30, 50, 60, 64}; + const ulong exp_nb = 11; + ulong exp_list[] = {5, 10, 20, 40, 80, 160, 1000, 10000, 100000, 1000000L, 10000000L}; + + flint_printf("compute an exponentiation a**e mod n, with nbits(n) = b\n"); + flint_printf(" computation is repeated on the element of a %wu-length array\n"); + flint_printf(" time is divided by %wu * FLINT_CLOCK_SCALE_FACTOR * log_2(exp)\n", NB_ITER, NB_ITER); + flint_printf("timings are: powmod_ui_precomp | powmod_ui_preinv | powmod2_ui_preinv\n"); + flint_printf("b \\ e\t"); + for (ulong e = 0; e < exp_nb; e++) + flint_printf("%wu\t\t", exp_list[e]); + flint_printf("\n"); + + info_t info; + + for (ulong b = 0; b < bits_nb; b++) + { + info.bits = bits_list[b]; + flint_printf("%wu\t", info.bits); + + for (ulong e = 0; e < exp_nb; e++) + { + info.exp = exp_list[e]; + double log_exp = d_log2((double)info.exp); + + if (info.bits <= 53) + { + 
prof_repeat(&min, &max, sample_precomp, (void *) &info); + flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + } + else + flint_printf(" na |"); + + prof_repeat(&min, &max, sample_preinv, (void *) &info); + flint_printf("%4.1f|", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + + prof_repeat(&min, &max, sample_preinv2, (void *) &info); + flint_printf("%4.1f\t", min/(NB_ITER * FLINT_CLOCK_SCALE_FACTOR * log_exp)); + } + flint_printf("\n"); + } + + return 0; +} From 4a887b45cbdd3d891e9eac06e988a7221ee5744c Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 00:21:32 +0200 Subject: [PATCH 02/71] add .h file --- src/n_fft.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 src/n_fft.h diff --git a/src/n_fft.h b/src/n_fft.h new file mode 100644 index 0000000000..08b4728da9 --- /dev/null +++ b/src/n_fft.h @@ -0,0 +1,24 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#ifndef N_POLY_TYPES_H +#define N_POLY_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* N_FFT_H */ From aee38b3c5e27fb3a2b0cf203165f4bfca59a832d Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 00:21:49 +0200 Subject: [PATCH 03/71] fix ifndef --- src/n_fft.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 08b4728da9..34e5dea79a 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -9,8 +9,8 @@ (at your option) any later version. See . 
*/ -#ifndef N_POLY_TYPES_H -#define N_POLY_TYPES_H +#ifndef N_FFT_H +#define N_FFT_H #ifdef __cplusplus extern "C" { From 17faaeac5737765d2e1203bccf4c1675bbfe26a8 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 01:00:59 +0200 Subject: [PATCH 04/71] context and init code --- src/n_fft.h | 78 ++++++++++++++++++++++++++++++++ src/n_fft/init.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 src/n_fft/init.c diff --git a/src/n_fft.h b/src/n_fft.h index 34e5dea79a..9be4cd747f 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -12,11 +12,89 @@ #ifndef N_FFT_H #define N_FFT_H +#include "flint.h" + +#define N_FFT_CTX_DEFAULT_DEPTH 12 + #ifdef __cplusplus extern "C" { #endif +/** n_fft context: + * parameters and tabulated powers of the primitive root of unity "w". + **/ + +typedef struct +{ + ulong mod; // modulus, odd prime + ulong mod2; // 2*mod (storing helps for speed) + ulong mod4; // 4*mod (storing helps for speed) + ulong max_depth; // maximum supported depth (w has order 2**max_depth) + ulong depth; // depth supported by current precomputation + ulong * tab_w; // tabulated powers of w, see below + ulong tab_w2[128]; // powers w**(2**k), see below +} n_fft_ctx_struct; +typedef n_fft_ctx_struct n_fft_ctx_t[1]; + +/** Requirements (not checked upon init): + * - mod is an odd prime < 2**61 + * - max_depth must be >= 3 (so, 8 must divide mod - 1) + * Total memory cost of precomputations: <= 128 + 2**(depth+1) ulong's + * + * TODO[short term] confirm the limit on the modulus + * TODO[longer term] large depth can lead to heavy memory usage + * --> provide precomputation-free functions + **/ + +/** tab_w2: contains powers w**d for d a power of 2, + * and corresponding precomputations for modular multiplication: + * - for 0 <= k < max_depth-1, tab_w2[2*k] = w**(2**(max_depth-2-k)) + * and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod) + * - for 2*max_depth <= k < 128, tab_w2[k] is undefined 
+ * In particular the first elements are tab_w2 = [I, I_pr, J, J_pr, ...] + * where I is a square root of -1 and J is a square root of I. + **/ + +/** tab_w: contains 2**depth first powers of w**k in bit reversed order, + * and corresponding precomputations for modular multiplication: + * - for 0 <= k < 2**depth, tab_w[2*k] = w**(br[k]) + * and tab_w[2*k+1] = floor(tab_w[2*k] * 2**FLINT_BITS / mod) + * where br = [0, 2**(depth-1), 2**(depth-2), 3 * 2**(depth-2), ...] + * is the bit reversal permutation of length 2**depth + * (https://en.wikipedia.org/wiki/Bit-reversal_permutation). + **/ + + +/* note for init functions, when depth is provided: + * - if it is < 3, it is pretended that it is 3 + * - it it is more than F->max_depth (the maximum possible with the given + * prime), it is reduced to F->max_depth + **/ + +// initialize with given root and given depth +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, ulong mod); + +// find primitive root, initialize with given depth +void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p); + +// same, with default depth +FLINT_INLINE void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong p) +{ n_fft_ctx_init2_root(F, w, max_depth, N_FFT_CTX_DEFAULT_DEPTH, p); } + +FLINT_INLINE void n_fft_ctx_init(n_fft_ctx_t F, ulong p) +{ n_fft_ctx_init2(F, N_FFT_CTX_DEFAULT_DEPTH, p); } + +// grows F->depth and precomputations to support DFTs of depth up to depth +void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth); + +void n_fft_ctx_clear(n_fft_ctx_t F); + + + + + + #ifdef __cplusplus } #endif diff --git a/src/n_fft/init.c b/src/n_fft/init.c new file mode 100644 index 0000000000..7a944cc72e --- /dev/null +++ b/src/n_fft/init.c @@ -0,0 +1,113 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. 
+ + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "ulong_extras.h" +#include "n_fft.h" + +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, ulong p) +{ + if (depth < 3) + depth = 3; + if (max_depth < depth) + depth = max_depth; + + // fill basic attributes + F->mod = p; + F->mod2 = 2*p; + F->mod4 = 4*p; + F->max_depth = max_depth; + F->depth = 3; // to be able to call fit_depth below + + // fill tab_w2 + ulong pr_quo, pr_rem, ww; + ww = w; + n_mulmod_precomp_shoup_quo_rem(&pr_quo, &pr_rem, ww, p); + F->tab_w2[2*(max_depth-2)] = ww; + F->tab_w2[2*(max_depth-2)+1] = pr_quo; + for (slong k = max_depth-3; k >= 0; k--) + { + // ww <- ww**2 and its precomputed quotient + n_mulmod_and_precomp_shoup(&ww, &pr_quo, ww, ww, pr_quo, pr_rem, pr_quo, p); + pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, p); + F->tab_w2[2*k] = ww; + F->tab_w2[2*k+1] = pr_quo; + } + // at this stage, pr_quo and pr_rem are for k == 0 i.e. 
for I == tab_w2[0] + + // fill tab_w for depth 3 + ulong len = UWORD(1) << (depth-1); // len >= 4 + F->tab_w = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); + + F->tab_w[0] = UWORD(1); + F->tab_w[1] = n_mulmod_precomp_shoup(UWORD(1), p); + F->tab_w[2] = F->tab_w2[0]; + F->tab_w[3] = F->tab_w2[1]; + F->tab_w[4] = F->tab_w2[2]; + F->tab_w[5] = F->tab_w2[3]; + n_mulmod_and_precomp_shoup(F->tab_w+6, F->tab_w+7, F->tab_w2[0], F->tab_w2[2], pr_quo, pr_rem, F->tab_w2[3], p); + + // complete tab_w up to specified depth + n_fft_ctx_fit_depth(F, depth); +} + +void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) +{ + FLINT_ASSERT(p > 2 && flint_clz(p) >= 3); // 2 < p < 2**61 + FLINT_ASSERT(flint_ctz(p - UWORD(1)) >= 3); // p-1 divisible by 8 + + // find the constant and exponent such that p == c * 2**max_depth + 1 + const ulong max_depth = flint_ctz(p - UWORD(1)); + const ulong c = (p - UWORD(1)) >> max_depth; + + // find primitive root w of order 2**max_depth + const ulong prim_root = n_primitive_root_prime(p); + const ulong w = n_powmod2(prim_root, c, p); + + // fill all attributes and tables + n_fft_ctx_init2_root(F, w, max_depth, depth, p); +} + +void n_fft_ctx_clear(n_fft_ctx_t F) +{ + flint_free(F->tab_w); +} + +void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) +{ + if (F->max_depth < depth) + depth = F->max_depth; + + if (depth > F->depth) + { + ulong len = UWORD(1) << (depth-1); // len >= 8 (since depth >= 4) + F->tab_w = flint_realloc(F->tab_w, 2*len * sizeof(ulong)); + + // tab_w[2] is w**(L/8) * tab_w[0], where L = 2**max_depth, + // tab_w[2*4,2*6] is w**(L/16) * tab_w[2*0,2*2], + // tab_w[2*8,2*10,2*12,2*14] is w**(L/32) * tab_w[2*0,2*2,2*4,2*6], etc. 
+ // recall tab_w2[2*d] == w**(L / 2**(d+2)) + ulong d = F->depth - 1; + ulong llen = UWORD(1) << (F->depth-1); + ulong ww, pr_quo, pr_rem; + for ( ; llen < len; llen <<= 1, d += 1) + { + ww = F->tab_w2[2*d]; + pr_quo = F->tab_w2[2*d+1]; + pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod); + // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation + for (ulong k = 0; k < llen; k++) + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*k, F->tab_w + 2*llen + 2*k+1, + ww, F->tab_w[2*k], + pr_quo, pr_rem, F->tab_w[2*k+1], F->mod); + } + F->depth = depth; + } +} From e7382569599e00aa1039983b2a846ffedbb92511 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 01:18:54 +0200 Subject: [PATCH 05/71] add profile --- Makefile.in | 3 +- src/n_fft/profile/p-init.c | 104 +++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 src/n_fft/profile/p-init.c diff --git a/Makefile.in b/Makefile.in index 5b170b9069..91a975ba52 100644 --- a/Makefile.in +++ b/Makefile.in @@ -188,7 +188,8 @@ HEADER_DIRS := \ fmpz_mod_mpoly_factor fmpq_mpoly_factor \ fq_nmod_mpoly_factor fq_zech_mpoly_factor \ \ - fft @FFT_SMALL@ fmpz_poly_q fmpz_lll \ + fft n_fft @FFT_SMALL@ \ + fmpz_poly_q fmpz_lll \ n_poly arith qsieve aprcl \ \ nf nf_elem qfb \ diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c new file mode 100644 index 0000000000..69058d5521 --- /dev/null +++ b/src/n_fft/profile/p-init.c @@ -0,0 +1,104 @@ +#include "flint.h" +#include "nmod.h" +#include "profiler.h" +#include "n_fft.h" + +#define num_primes 5 + +typedef struct +{ + ulong prime; + ulong depth; + ulong maxdepth; +} info_t; + +void sample_init2_root(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + ulong p = info->prime; + ulong depth = info->depth; + ulong maxdepth = info->maxdepth; + + const ulong len = UWORD(1) << depth; + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); + + // modulus, roots 
of unity + nmod_t mod; + nmod_init(&mod, p); + ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), (p - 1) >> maxdepth, mod); + ulong w = nmod_pow_ui(w0, 1UL<<(maxdepth - depth), mod); + + FLINT_TEST_INIT(state); + + for (ulong i = 0; i < count; i++) + { + prof_start(); + for (ulong j = 0; j < rep; j++) + { + n_fft_ctx_t F; + n_fft_ctx_init2_root(F, w, depth, depth, p); + n_fft_ctx_clear(F); + } + prof_stop(); + } + + FLINT_TEST_CLEAR(state); +} + +/*-----------------------------------------------------------------*/ +/* initialize context for FFT for several bit lengths and depths */ +/*-----------------------------------------------------------------*/ +void time_fft_init(ulong * primes, ulong * max_depths) +{ + for (ulong k = 4; k < num_primes; k++) + { + for (ulong depth = 3; depth <= max_depths[k]; depth++) + { + printf("%ld\t", depth); + + info_t info; + info.prime = primes[k]; + info.maxdepth = max_depths[k]; + info.depth = depth; + + const ulong len = UWORD(1) << depth; + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); + + double min; + double max; + + prof_repeat(&min, &max, sample_init2_root, (void *) &info); + + flint_printf("\t%.1e|%.1e\t", + min/(double)FLINT_CLOCK_SCALE_FACTOR/len/rep, + min/(double)1000000/rep + ); + flint_printf("\n"); + } + } + +} + +/*------------------------------------------------------------*/ +/* main just calls time_fft_init() */ +/*------------------------------------------------------------*/ +int main() +{ + printf("- depth is log(fft length)\n"); + printf("- timing init FFT context at this depth\n"); + printf("depth\t\tred init new\n"); + + ulong primes[num_primes] = { + 786433, // 20 bits, 1 + 2**18 * 3 + 2013265921, // 31 bits, 1 + 2**27 * 3 * 5 + 2748779069441, // 42 bits, 1 + 2**39 * 5 + 1108307720798209, // 50 bits, 1 + 2**44 * 3**2 * 7 + 1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23 + }; + ulong max_depths[num_primes] = { 18, 25, 25, 25, 25 }; + + time_fft_init(primes, max_depths); + + return 0; 
+} + From dcaede7ee5a2157f33aceeab3f8d370f0df550ed Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 01:26:35 +0200 Subject: [PATCH 06/71] fix include --- src/n_fft/profile/p-init.c | 11 +++++++++++ src/nmod_vec/profile/p-dot.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c index 69058d5521..3330104481 100644 --- a/src/n_fft/profile/p-init.c +++ b/src/n_fft/profile/p-init.c @@ -1,3 +1,14 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + #include "flint.h" #include "nmod.h" #include "profiler.h" diff --git a/src/nmod_vec/profile/p-dot.c b/src/nmod_vec/profile/p-dot.c index 6d226710be..217f715704 100644 --- a/src/nmod_vec/profile/p-dot.c +++ b/src/nmod_vec/profile/p-dot.c @@ -9,9 +9,9 @@ (at your option) any later version. See . 
*/ -#include #include // for atoi +#include "ulong_extras.h" #include "profiler.h" #include "nmod.h" #include "nmod_vec.h" From cd507874053cfc4f8d60fe4326e83eafb5a55acb Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 01:40:46 +0200 Subject: [PATCH 07/71] improve profile init --- src/n_fft/profile/p-init.c | 58 ++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c index 3330104481..75378be6d1 100644 --- a/src/n_fft/profile/p-init.c +++ b/src/n_fft/profile/p-init.c @@ -61,31 +61,35 @@ void sample_init2_root(void * arg, ulong count) /*-----------------------------------------------------------------*/ void time_fft_init(ulong * primes, ulong * max_depths) { - for (ulong k = 4; k < num_primes; k++) + for (ulong depth = 3; depth <= 25; depth++) { - for (ulong depth = 3; depth <= max_depths[k]; depth++) + printf("%ld\t", depth); + for (ulong k = 0; k < num_primes; k++) { - printf("%ld\t", depth); - - info_t info; - info.prime = primes[k]; - info.maxdepth = max_depths[k]; - info.depth = depth; - - const ulong len = UWORD(1) << depth; - const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); - - double min; - double max; - - prof_repeat(&min, &max, sample_init2_root, (void *) &info); - - flint_printf("\t%.1e|%.1e\t", - min/(double)FLINT_CLOCK_SCALE_FACTOR/len/rep, - min/(double)1000000/rep - ); - flint_printf("\n"); + if (depth <= max_depths[k]) + { + info_t info; + info.prime = primes[k]; + info.maxdepth = max_depths[k]; + info.depth = depth; + + const ulong len = UWORD(1) << depth; + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); + + double min; + double max; + + prof_repeat(&min, &max, sample_init2_root, (void *) &info); + + flint_printf("%.1e|%.1e\t", + min/(double)1000000/rep, + min/(double)FLINT_CLOCK_SCALE_FACTOR/len/rep + ); + } + else + flint_printf(" na | na \t"); } + flint_printf("\n"); } } @@ -95,9 +99,13 @@ void 
time_fft_init(ulong * primes, ulong * max_depths) /*------------------------------------------------------------*/ int main() { - printf("- depth is log(fft length)\n"); - printf("- timing init FFT context at this depth\n"); - printf("depth\t\tred init new\n"); + printf("- depth == precomputing w**k, 0 <= k < 2**depth\n"); + printf("- timing init FFT context + clear at this depth:\n"); + printf(" t_raw == raw time\n"); + printf(" t_unit == raw time divided by 2**depth * clock scale factor\n"); + printf("\n"); + + printf("depth\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\n"); ulong primes[num_primes] = { 786433, // 20 bits, 1 + 2**18 * 3 From afa5ddccb8c93010c2bac76e1afdae76a387385e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 02:10:04 +0200 Subject: [PATCH 08/71] rename ctx init --- src/n_fft/{init.c => ctx_init.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/n_fft/{init.c => ctx_init.c} (100%) diff --git a/src/n_fft/init.c b/src/n_fft/ctx_init.c similarity index 100% rename from src/n_fft/init.c rename to src/n_fft/ctx_init.c From fd24de2a59ad701c7c2c5c70179c3bfdf08074ac Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 02:10:23 +0200 Subject: [PATCH 09/71] testing init --- src/n_fft/test/t-init.c | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/n_fft/test/t-init.c diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c new file mode 100644 index 0000000000..bc149cfb77 --- /dev/null +++ b/src/n_fft/test/t-init.c @@ -0,0 +1,63 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "test_helpers.h" +#include "ulong_extras.h" +#include "n_fft.h" + +TEST_FUNCTION_START(n_fft_ctx_init2, state) +{ + int i; + + for (i = 0; i < 1000 * flint_test_multiplier(); i++) + { + // take random prime in [8, 2**61) + ulong bits = 4 + n_randint(state, 57); + ulong p = n_randprime(state, bits, 1); + ulong max_depth = flint_ctz(p-1); + + // we need p such that 8 divides p-1 + while (max_depth < 3) + { + p = n_randprime(state, bits, 1); + max_depth = flint_ctz(p-1); + } + + // take depth between 0 and min(12, max_depth) + ulong depth = n_randint(state, FLINT_MIN(10, max_depth)); + + // init + n_fft_ctx_t F; + n_fft_ctx_init2(F, depth, p); + + if (F->mod != p) + TEST_FUNCTION_FAIL( + "mod = %wu\n" + "F->mod = %wu\n", + p, F->mod); + + if (F->mod2 != 2*p) + TEST_FUNCTION_FAIL( + "F->mod = %wu\n" + "F->mod2 = %wu\n", + F->mod, F->mod2); + + if (F->mod4 != 4*p) + TEST_FUNCTION_FAIL( + "F->mod = %wu\n" + "F->mod4 = %wu\n", + F->mod, F->mod4); + + n_fft_ctx_clear(F); + } + + TEST_FUNCTION_END(state); +} From 211ab75e280aa646638939c0d57cc2dde2cbe891 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 12:26:49 +0200 Subject: [PATCH 10/71] fix explanations and complete test for init --- src/n_fft.h | 35 ++++++++---- src/n_fft/test/t-init.c | 123 +++++++++++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 34 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 9be4cd747f..39cbf0f027 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -47,22 +47,33 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; * --> provide precomputation-free functions **/ -/** tab_w2: contains powers w**d for d a power of 2, - * and corresponding precomputations for modular multiplication: - * - for 0 <= k < max_depth-1, tab_w2[2*k] = w**(2**(max_depth-2-k)) +/** tab_w2: + * - length 128, with undefined entries at index 2*max_depth and beyond + * - contains powers w**d for d a power of 2, and corresponding + * precomputations for modular multiplication: + * 
-- for 0 <= k < max_depth-1, tab_w2[2*k] = w**(2**(max_depth-2-k)) * and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod) - * - for 2*max_depth <= k < 128, tab_w2[k] is undefined - * In particular the first elements are tab_w2 = [I, I_pr, J, J_pr, ...] + * -- for 2*max_depth <= k < 128, tab_w2[k] is undefined + * + * The first elements are tab_w2 = [I, I_pr, J, J_pr, ...] * where I is a square root of -1 and J is a square root of I. **/ -/** tab_w: contains 2**depth first powers of w**k in bit reversed order, - * and corresponding precomputations for modular multiplication: - * - for 0 <= k < 2**depth, tab_w[2*k] = w**(br[k]) - * and tab_w[2*k+1] = floor(tab_w[2*k] * 2**FLINT_BITS / mod) - * where br = [0, 2**(depth-1), 2**(depth-2), 3 * 2**(depth-2), ...] - * is the bit reversal permutation of length 2**depth - * (https://en.wikipedia.org/wiki/Bit-reversal_permutation). +/** tab_w: + * - length 2**depth + * - contains 2**(depth-1) first powers of w in (max_depth-1)-bit reversed order, + * and corresponding precomputations for modular multiplication: + * -- for 0 <= k < 2**(depth-1), tab_w[2*k] = w**(br[k]) + * and tab_w[2*k+1] = floor(tab_w[2*k] * 2**FLINT_BITS / mod) + * where br = [0, 2**(max_depth-2), 2**(max_depth-3), 3 * 2**(max_depth-3), ...] + * is the bit reversal permutation of length 2**(max_depth-1) + * (https://en.wikipedia.org/wiki/Bit-reversal_permutation) + * + * In particular the first elements are + * tab_w = [1, 1_pr, I, I_pr, J, J_pr, IJ, IJ_pr, ...] + * where I is a square root of -1, J is a square root of I, and IJ = I*J. Note + * that powers of w beyond 2**(max_depth-1), for example -1, -I, -J, etc. are + * not stored. **/ diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index bc149cfb77..de6c9ea245 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -13,48 +13,127 @@ #include "ulong_extras.h" #include "n_fft.h" +// return bit reversal index of k for given nbits: +// e.g. 
br_index([0,1,2,3], 4) == [0, 8, 4, 12] +static inline ulong br_index(ulong k, ulong nbits) +{ + k = ((k >> 1) & 0x55555555) | ((k & 0x55555555) << 1); + k = ((k >> 2) & 0x33333333) | ((k & 0x33333333) << 2); + k = ((k >> 4) & 0x0F0F0F0F) | ((k & 0x0F0F0F0F) << 4); + k = ((k >> 8) & 0x00FF00FF) | ((k & 0x00FF00FF) << 8); + k = ( k >> 16 ) | ( k << 16); +#if FLINT_BITS == 64 + k = ( k >> 32 ) | ( k << 32); +#endif // FLINT_BITS == 64 + + return k >> (FLINT_BITS - nbits); +} + +int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t state) +{ + // if depth < 3, init is supposed to behave as if depth == 3 + depth = FLINT_MAX(3, depth); + + // check all basic attributes + if (F->mod != p) + return 1; + + if (F->mod2 != 2*p) + return 2; + + if (F->mod4 != 4*p) + return 3; + + if (F->max_depth != max_depth) + return 4; + + if (F->depth != depth) + return 5; + + // check the primitive root + ulong w = F->tab_w2[2*(max_depth-2)]; + if (n_powmod2(w, UWORD(1)<tab_w2[2*k]; + if (w2 != n_powmod2(w, UWORD(1)<<(max_depth-2-k), p)) + return 7; + if (F->tab_w2[2*k+1] != n_mulmod_precomp_shoup(w2, p)) + return 8; + } + + // check a few random entries of tab_w + for (ulong j = 0; j < 1000; j++) + { + ulong k = n_randint(state, UWORD(1) << (F->depth - 1)); + ulong wk = F->tab_w[2*k]; + + ulong exp = br_index(k, F->max_depth - 1); + if (wk != n_powmod2(w, exp, p)) + return 9; + + if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p)) + return 10; + } + + return 0; +} + TEST_FUNCTION_START(n_fft_ctx_init2, state) { int i; for (i = 0; i < 1000 * flint_test_multiplier(); i++) { - // take random prime in [8, 2**61) - ulong bits = 4 + n_randint(state, 57); - ulong p = n_randprime(state, bits, 1); - ulong max_depth = flint_ctz(p-1); - - // we need p such that 8 divides p-1 - while (max_depth < 3) + ulong p, max_depth; + if (i % 20 != 0) { + // take random prime in [17, 2**61) + ulong bits = 5 + n_randint(state, 57); p = n_randprime(state, bits, 1); max_depth = 
flint_ctz(p-1); + + // we need p such that 8 divides p-1 + while (max_depth < 3) + { + p = n_randprime(state, bits, 1); + max_depth = flint_ctz(p-1); + } + } + else + { + // the above will most often have max_depth 3 or 4 + // every now and then we want p with larger max_depth + max_depth = 10 + n_randint(state, 10); + p = 1 + (UWORD(1) << max_depth); + while (! n_is_prime(p)) + p += (UWORD(1) << max_depth); + max_depth = flint_ctz(p-1); } // take depth between 0 and min(12, max_depth) ulong depth = n_randint(state, FLINT_MIN(10, max_depth)); + printf("%lu, %lu, %lu\n", p, max_depth, depth); + // init n_fft_ctx_t F; n_fft_ctx_init2(F, depth, p); - if (F->mod != p) - TEST_FUNCTION_FAIL( - "mod = %wu\n" - "F->mod = %wu\n", - p, F->mod); - - if (F->mod2 != 2*p) - TEST_FUNCTION_FAIL( - "F->mod = %wu\n" - "F->mod2 = %wu\n", - F->mod, F->mod2); + int res = test_one(F, max_depth, depth, p, state); - if (F->mod4 != 4*p) + if (res) TEST_FUNCTION_FAIL( - "F->mod = %wu\n" - "F->mod4 = %wu\n", - F->mod, F->mod4); + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "error code = %wu\n", + p, F->tab_w2[2*(max_depth-2)], max_depth, depth, res); n_fft_ctx_clear(F); } From 6368823fbce649f074771d01086c91d62c1c7c22 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 20 Oct 2024 12:36:36 +0200 Subject: [PATCH 11/71] remove printf --- src/n_fft/test/t-init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index de6c9ea245..b6164207a5 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -116,9 +116,7 @@ TEST_FUNCTION_START(n_fft_ctx_init2, state) } // take depth between 0 and min(12, max_depth) - ulong depth = n_randint(state, FLINT_MIN(10, max_depth)); - - printf("%lu, %lu, %lu\n", p, max_depth, depth); + ulong depth = n_randint(state, FLINT_MIN(12, max_depth)); // init n_fft_ctx_t F; From 9eeedd639041b412fedcb2f407f09fc1f43a03a4 Mon Sep 17 00:00:00 2001 From: 
Vincent Neiger Date: Tue, 22 Oct 2024 10:19:32 +0200 Subject: [PATCH 12/71] forgot to add main --- src/n_fft.h | 6 ++++++ src/n_fft/test/main.c | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 src/n_fft/test/main.c diff --git a/src/n_fft.h b/src/n_fft.h index 39cbf0f027..6bfa594192 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -81,6 +81,7 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; * - if it is < 3, it is pretended that it is 3 * - it it is more than F->max_depth (the maximum possible with the given * prime), it is reduced to F->max_depth + * After calling init, precomputations support DFTs of length up to 2**depth **/ // initialize with given root and given depth @@ -106,6 +107,11 @@ void n_fft_ctx_clear(n_fft_ctx_t F); + + + + + #ifdef __cplusplus } #endif diff --git a/src/n_fft/test/main.c b/src/n_fft/test/main.c new file mode 100644 index 0000000000..842bb0bdf9 --- /dev/null +++ b/src/n_fft/test/main.c @@ -0,0 +1,25 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +/* Include functions *********************************************************/ + +#include "t-init.c" + +/* Array of test functions ***************************************************/ + +test_struct tests[] = +{ + TEST_FUNCTION(n_fft_ctx_init2), +}; + +/* main function *************************************************************/ + +TEST_MAIN(tests) From 3fa79446c8fbfb38d4b3f058069a988fe6efcd57 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 00:55:06 +0200 Subject: [PATCH 13/71] dft, test passes --- src/n_fft.h | 5 + src/n_fft/dft.c | 472 +++++++++++++++++++++++++++++++++++++++++ src/n_fft/test/main.c | 2 + src/n_fft/test/t-dft.c | 123 +++++++++++ 4 files changed, 602 insertions(+) create mode 100644 src/n_fft/dft.c create mode 100644 src/n_fft/test/t-dft.c diff --git a/src/n_fft.h b/src/n_fft.h index 6bfa594192..5299f4d557 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -107,6 +107,11 @@ void n_fft_ctx_clear(n_fft_ctx_t F); +/** dft: + * transforms, inverse transforms, transposed transforms + * at length a power of 2 + */ +void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F); diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c new file mode 100644 index 0000000000..4cf8b5ed52 --- /dev/null +++ b/src/n_fft/dft.c @@ -0,0 +1,472 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "longlong.h" +#include "n_fft.h" + +/*---------*/ +/* helpers */ +/*---------*/ + +/** Shoup's modular multiplication with precomputation, lazy + * (does not perform the excess correction step) + * --> computes either r or r+n and store it is res, where r = (a*b) % n + * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries + * --> requires nbits(n) < FLINT_BITS + */ +#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ + do { \ + umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ + res = (a) * (b) - p_hi * (n); \ + } while(0) + +/*-------------*/ +/* 2-point DFT */ +/*-------------*/ + +/** Cooley-Tukey butterfly, node 0 + * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n2 is 2*n, tmp is a temporary + */ +#define DFT2_NODE0_LAZY24(a, b, n2, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n2) - tmp; \ + (a) = (a) + tmp; \ + if ((a) >= (n2)) \ + (a) -= (n2); \ + } while(0) + + +/** Cooley-Tukey butterfly, general + * * in [0..4n) / out [0..4n) / max < 4n + * * In-place transform + * [1 1] + * [a b] <- [a b] [w -w] + * * n2 is 2*n, w_pr is the precomputed data for multiplication by w mod n + * p_hi, p_lo, u, v are temporaries + */ +#define DFT2_LAZY44(a, b, n, n2, w, w_pr, p_hi, p_lo, u, v) \ + do { \ + u = (a); \ + if (u >= (n2)) \ + u -= (n2); /* [0..2n) */ \ + v = (b); \ + N_MULMOD_PRECOMP_LAZY(v, w, v, w_pr, n, p_hi, p_lo); \ + (a) = u + v; /* [0..4n) */ \ + (b) = u + (n2) - v; /* [0..4n) */ \ + } while(0) + + +/*-------------*/ +/* 4-point DFT */ +/*-------------*/ + +/** 4-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 4n + * * In-place transform + * [1 1 1 1] + * [1 -1 I -I] + * [a b c d] <- [a b c d] [1 1 -1 -1] + * [1 -1 -I I] + * * Corresponds to reducing down the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + * where I is typically a square root of -1 + * (but this property is not exploited) + 
*/ +#define DFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ + do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ + p_hi, p_lo); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ + } while(0) + +/** 4-point DFT, general + * * in [0..4n) / out [0..4n) / max < 8n + * * In-place transform + * [ 1 1 1 1] + * [w2 -w2 w3 -w3] + * [a b c d] <- [a b c d] [w1 w1 -w1 -w1] + * [w1*w2 -w1*w2 -w1*w3 w1*w3] + * * Corresponds to reducing down the tree with nodes + * x^4 - w1**2 + * / \ + * x^2 - w1 x^2 + w1 + * / \ / \ + * x - w2 x + w2 x - w3 x + w3 + * typically w2**2 == w1 and w3 == I*w2 (so that w3**2 == -w1) so that this + * really is the subproduct tree built from the four roots + * w2, -w2, I*w2, -I*w2 of x**4 - w1 + */ +#define DFT4_LAZY44(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2, p_hi, p_lo, tmp) \ +do { \ + ulong u0 = (a); \ + ulong u1 = (b); \ + ulong u2 = (c); \ + ulong u3 = (d); \ + if (u0 >= n2) \ + u0 -= n2; \ + if (u1 >= n2) \ + u1 -= n2; \ + \ + N_MULMOD_PRECOMP_LAZY(u2, w1, u2, w1_pr, n, p_hi, p_lo); \ + tmp = u0; \ + u0 = u0 + u2; /* [0..4n) */ \ + u2 = tmp + n2 - u2; /* [0..4n) */ \ + if (u0 >= n2) \ + u0 -= n2; /* [0..2n) */ \ + if (u2 >= n2) \ + u2 -= n2; /* [0..2n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u3, w1, u3, w1_pr, n, p_hi, p_lo); \ + tmp = u1; \ + u1 = u1 + u3; /* [0..8n) */ \ + u3 = tmp + n2 - u3; /* [0..8n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u1, w2, u1, w2_pr, n, p_hi, p_lo); \ + tmp = u0; \ + (a) = u0 + u1; /* [0..4n) */ \ + (b) = tmp + n2 - u1; /* 
[0..4n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u3, w3, u3, w3_pr, n, p_hi, p_lo); \ + tmp = u2; \ + (c) = u2 + u3; /* [0..4n) */ \ + (d) = tmp + n2 - u3; /* [0..4n) */ \ +} while(0) + +/*-------------*/ +/* 8-point DFT */ +/*-------------*/ + +/** 8-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial + * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations + * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) + * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order. + * * Recall [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ] + */ +FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ulong * p3, + ulong * p4, ulong * p5, ulong * p6, ulong * p7, + n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + + DFT2_NODE0_LAZY24(*p0, *p4, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p1, *p5, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); + + DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_LAZY44(*p4, *p5, *p6, *p7, + F->tab_w[2], F->tab_w[3], + F->tab_w[4], F->tab_w[5], + F->tab_w[6], F->tab_w[7], + F->mod, F->mod2, p_hi, p_lo, tmp); +} + +/** 8-point DFT + * * in [0..4n) / out [0..4n) / max < 8n + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial + * p(x) = p0 + p1*x + ... 
+ p7*x**7 into its evaluations + * p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3) + * where w_k = F->tab_w[8*node + 2*k] for 0 <= k < 4 + * * By construction these 8 evaluation points are the 8 roots of the + * polynomial x**8 - F->tab_w[node] + */ +FLINT_FORCE_INLINE void dft8_lazy44(ulong * p0, ulong * p1, ulong * p2, ulong * p3, + ulong * p4, ulong * p5, ulong * p6, ulong * p7, + ulong node, n_fft_ctx_t F) +{ + ulong p_hi, p_lo, u, v; + + const ulong w = F->tab_w[2*node]; + const ulong wpre = F->tab_w[2*node+1]; + + DFT2_LAZY44(*p0, *p4, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); + DFT2_LAZY44(*p1, *p5, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); + DFT2_LAZY44(*p2, *p6, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); + DFT2_LAZY44(*p3, *p7, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); + + DFT4_LAZY44(*p0, *p1, *p2, *p3, + F->tab_w[4*node], F->tab_w[4*node+1], + F->tab_w[8*node], F->tab_w[8*node+1], + F->tab_w[8*node+2], F->tab_w[8*node+3], + F->mod, F->mod2, p_hi, p_lo, u); + DFT4_LAZY44(*p4, *p5, *p6, *p7, + F->tab_w[4*node+2], F->tab_w[4*node+3], + F->tab_w[8*node+4], F->tab_w[8*node+5], + F->tab_w[8*node+6], F->tab_w[8*node+7], + F->mod, F->mod2, p_hi, p_lo, u); +} + +/*--------------*/ +/* 16-point DFT */ +/*--------------*/ + +// TODO doc for dft16 +// TODO simplify and bench other variants +// in [0..2n), out [0..4n), max value < 8n +FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[0] >= F->mod2) + p[0] -= F->mod2; + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[1] >= F->mod2) + p[1] -= F->mod2; + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[2] >= F->mod2) + p[2] -= F->mod2; + DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, 
p_hi, p_lo); + if (p[3] >= F->mod2) + p[3] -= F->mod2; + + // next line requires < 2n, hence the four reductions above + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_LAZY44(p[4], p[5], p[6], p[7], F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], F->tab_w[6], F->tab_w[7], F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[8], p[9], p[10], p[11], F->tab_w[4], F->tab_w[5], F->tab_w[8], F->tab_w[9], F->tab_w[10], F->tab_w[11], F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[12], p[13], p[14], p[15], F->tab_w[6], F->tab_w[7], F->tab_w[12], F->tab_w[13], F->tab_w[14], F->tab_w[15], F->mod, F->mod2, p_hi, p_lo, tmp); +} + +// TODO doc for dft16 +// TODO simplify and bench other variants +FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + + ulong w2 = F->tab_w[2*node]; + ulong w2pre = F->tab_w[2*node+1]; + ulong w = F->tab_w[4*node]; + ulong wpre = F->tab_w[4*node+1]; + ulong Iw = F->tab_w[4*node+2]; + ulong Iwpre = F->tab_w[4*node+3]; + DFT4_LAZY44(p[0], p[4], p[8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[5], p[9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + + w2 = F->tab_w[8*node]; + w2pre = F->tab_w[8*node+1]; + w = F->tab_w[16*node]; + wpre = F->tab_w[16*node+1]; + Iw = F->tab_w[16*node+2]; + Iwpre = F->tab_w[16*node+3]; + DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + + w2 = F->tab_w[8*node+2]; + w2pre = F->tab_w[8*node+3]; + w = F->tab_w[16*node+4]; + wpre = F->tab_w[16*node+5]; + Iw = F->tab_w[16*node+6]; + Iwpre = F->tab_w[16*node+7]; + DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, 
Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + + w2 = F->tab_w[8*node+4]; + w2pre = F->tab_w[8*node+5]; + w = F->tab_w[16*node+8]; + wpre = F->tab_w[16*node+9]; + Iw = F->tab_w[16*node+10]; + Iwpre = F->tab_w[16*node+11]; + DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + + w2 = F->tab_w[8*node+6]; + w2pre = F->tab_w[8*node+7]; + w = F->tab_w[16*node+12]; + wpre = F->tab_w[16*node+13]; + Iw = F->tab_w[16*node+14]; + Iwpre = F->tab_w[16*node+15]; + DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); +} + +/*--------------*/ +/* 32-point DFT */ +/*--------------*/ + + + + + + +/** 2**depth-point DFT + * * in [0..4n) / out [0..4n) / max < 8n + * * In-place transform p of length len == 2**depth into + * the concatenation of + * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] + * for k in range(len), + * where w_k = F->tab_w[2**depth * node + 2*k] for 0 <= k < 2**(depth-1) + * * By construction these evaluation points are the roots of the + * polynomial x**len - F->tab_w[node] + * * Requirement (not checked): + * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) + */ +// TODO remove argument len +// TODO remove restriction to length >= 3 ? 
+void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) +{ + if (depth == 3) + dft8_lazy44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F); + else if (depth == 4) + dft16_lazy44(p, node, F); + //else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) + // dft32_red_lazy_general(p, node, F); + else + { + // in: [0..4n), out: [0..4n) + const nn_ptr p0 = p; + const nn_ptr p1 = p+len/4; + const nn_ptr p2 = p+2*len/4; + const nn_ptr p3 = p+3*len/4; + const ulong w2 = F->tab_w[2*node]; + const ulong w2pre = F->tab_w[2*node+1]; + const ulong w = F->tab_w[4*node]; + const ulong wpre = F->tab_w[4*node+1]; + const ulong Iw = F->tab_w[4*node+2]; + const ulong Iwpre = F->tab_w[4*node+3]; + ulong p_hi, p_lo, tmp; + + for (ulong k = 0; k < len/4; k+=4) + { + DFT4_LAZY44(p0[k+0], p1[k+0], p2[k+0], p3[k+0], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p0[k+1], p1[k+1], p2[k+1], p3[k+1], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p0[k+2], p1[k+2], p2[k+2], p3[k+2], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + } + + dft_lazy44(p0, len/4, depth-2, 4*node, F); + dft_lazy44(p1, len/4, depth-2, 4*node+1, F); + dft_lazy44(p2, len/4, depth-2, 4*node+2, F); + dft_lazy44(p3, len/4, depth-2, 4*node+3, F); + } +} + +/** 2**depth-point DFT + * * in [0..2n) / out [0..4n) / max < 8n + * * In-place transform p of length len == 2**depth into + * the concatenation of + * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] + * for k in range(len), + * where w_k = F->tab_w[2*k] for 0 <= k < 2**(depth-1) + * * By construction these evaluation points are the roots of the polynomial + * x**len - 1, precisely they are all powers of the chosen len-th primitive + * root of unity with exponents listed in bit reversed order + * 
* Requirement (not checked): depth <= F.depth + */ +void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) +{ + // depth == 0: nothing to do + //if (depth == 1) + // // in [0..4n), out [0..4n) + // DFT2_LAZY4_RED(p[0], p[1], F->mod4); + //else if (depth == 2) + // // in [0..2n), out [0..4n) + // DFT4_LAZY2_RED(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + //else + if (depth == 3) + dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) + else if (depth == 4) + dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) + //else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) + // dft32_red_lazy(p, F); // in [0..2n), out [0..4n) + else + { + // input [0..2n) x [0..2n), output [0..2n) x [0..4n) + // (general accepts [0..4n) as input for depth >= 3) + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + ulong p_hi, p_lo; + for (ulong k = 0; k < len/4; k++) + { + DFT4_NODE0_LAZY24(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[k] >= F->mod2) + p[k] -= F->mod2; + } + dft_node0_lazy24(p0, len/4, depth-2, F); + dft_lazy44(p1, len/4, depth-2, 1, F); + dft_lazy44(p2, len/4, depth-2, 2, F); + dft_lazy44(p3, len/4, depth-2, 3, F); + } +} + +// TODO try lazier variant for entry point < n, to see if any gain +void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) +{ + // depth == 0: nothing to do + //if (depth == 1) + // // in [0..4n), out [0..4n) + // DFT2_LAZY4_RED(p[0], p[1], F->mod4); + //else if (depth == 2) + // // in [0..2n), out [0..4n) + // DFT4_LAZY2_RED(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); + //else + if (depth == 3) + dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) + else if (depth == 4) + dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) + //else if (depth == 5) // TODO unclear this 
helps (no acceleration on argiope) + // dft32_red_lazy(p, F); // in [0..2n), out [0..4n) + else + { + // input [0..2n) x [0..2n), output [0..2n) x [0..4n) + // (general accepts [0..4n) as input for depth >= 3) + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + ulong p_hi, p_lo; + for (ulong k = 0; k < len/4; k++) + { + DFT4_NODE0_LAZY24(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[k] >= F->mod2) + p[k] -= F->mod2; + } + dft_node0_lazy24(p0, len/4, depth-2, F); + dft_lazy44(p1, len/4, depth-2, 1, F); + dft_lazy44(p2, len/4, depth-2, 2, F); + dft_lazy44(p3, len/4, depth-2, 3, F); + } +} + + + + + + diff --git a/src/n_fft/test/main.c b/src/n_fft/test/main.c index 842bb0bdf9..a03cd0faa0 100644 --- a/src/n_fft/test/main.c +++ b/src/n_fft/test/main.c @@ -12,12 +12,14 @@ /* Include functions *********************************************************/ #include "t-init.c" +#include "t-dft.c" /* Array of test functions ***************************************************/ test_struct tests[] = { TEST_FUNCTION(n_fft_ctx_init2), + TEST_FUNCTION(n_fft_dft), }; /* main function *************************************************************/ diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c new file mode 100644 index 0000000000..a4c12ce2d0 --- /dev/null +++ b/src/n_fft/test/t-dft.c @@ -0,0 +1,123 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "test_helpers.h" +#include "ulong_extras.h" +#include "nmod.h" +#include "nmod_poly.h" +#include "nmod_vec.h" +#include "n_fft.h" + +#define MAX_EVAL_DEPTH 10 + +// vector equality up to reduction mod +static int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) +{ + for (ulong k = 0; k < len; k++) + { + ulong v1; + ulong v2; + NMOD_RED(v1, vec1[k], mod); + NMOD_RED(v2, vec2[k], mod); + if (v1 != v2) + return 0; + } + + return 1; +} + +// testing that all elements of "vec" are less than "bound" +static int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) +{ + for (ulong k = 0; k < len; k++) + if (vec[k] >= bound) + return 0; + + return 1; +} + + +TEST_FUNCTION_START(n_fft_dft, state) +{ + int i; + + for (i = 0; i < 100 * flint_test_multiplier(); i++) + { + // take some FFT prime p with max_depth >= 12 + ulong max_depth = 12 + n_randint(state, 10); + ulong p = 1 + (UWORD(1) << max_depth); + while (! n_is_prime(p)) + p += (UWORD(1) << max_depth); + max_depth = flint_ctz(p-1); + + nmod_t mod; + nmod_init(&mod, p); + + // init FFT root tables + n_fft_ctx_t F; + n_fft_ctx_init2(F, MAX_EVAL_DEPTH, p); + + for (ulong depth = 3; depth <= MAX_EVAL_DEPTH; depth++) + { + const ulong len = (1UL<tab_w[2*k]; + evals_br[2*k] = nmod_poly_evaluate_nmod(pol, point); + evals_br[2*k+1] = nmod_poly_evaluate_nmod(pol, nmod_neg(point, mod)); + } + + // evals by DFT + ulong * p = _nmod_vec_init(len); + _nmod_vec_set(p, pol->coeffs, len); + + n_fft_dft(p, len, depth, F); + + int res = nmod_vec_red_equal(evals_br, p, len, mod); + + if (!res) + TEST_FUNCTION_FAIL( + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "failed equality test\n", + p, F->tab_w2[2*(max_depth-2)], max_depth, depth); + + res = nmod_vec_range(p, len, 4*mod.n); + + if (!res) + TEST_FUNCTION_FAIL( + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "failed range test\n", + p, F->tab_w2[2*(max_depth-2)], 
max_depth, depth); + + _nmod_vec_clear(p); + nmod_poly_clear(pol); + _nmod_vec_clear(evals_br); + } + + n_fft_ctx_clear(F); + } + + TEST_FUNCTION_END(state); +} From ff335337bba8a0c598820a12ba7eeeb7c90dc295 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 01:04:01 +0200 Subject: [PATCH 14/71] add profile --- src/n_fft/profile/p-dft.c | 143 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 src/n_fft/profile/p-dft.c diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c new file mode 100644 index 0000000000..da6716ec34 --- /dev/null +++ b/src/n_fft/profile/p-dft.c @@ -0,0 +1,143 @@ +#include "profiler.h" +#include "nmod_vec.h" +#include "fft_small.h" +#include "n_fft.h" + +#define num_primes 5 + +typedef struct +{ + ulong prime; + ulong depth; + ulong maxdepth; + ulong stride; +} info_t; + +#define SAMPLE(fun, _variant) \ +void sample_##fun##_variant(void * arg, ulong count) \ +{ \ + info_t * info = (info_t *) arg; \ + const ulong p = info->prime; \ + const ulong depth = info->depth; \ + const ulong maxdepth = info->maxdepth; \ + const ulong stride = info->stride; \ + \ + const ulong len = stride * (UWORD(1) << depth); \ + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); \ + \ + /* modulus, roots of unity */ \ + nmod_t mod; \ + nmod_init(&mod, p); \ + ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), (p - 1) >> maxdepth, mod); \ + ulong w = nmod_pow_ui(w0, 1UL<<(maxdepth - depth), mod); \ + n_fft_ctx_t F; \ + n_fft_ctx_init2_root(F, w, depth, depth, p); \ + \ + FLINT_TEST_INIT(state); \ + \ + ulong * coeffs = _nmod_vec_init(len); \ + _nmod_vec_randtest(coeffs, state, len, mod); \ + \ + for (ulong i = 0; i < count; i++) \ + { \ + prof_start(); \ + for (ulong j = 0; j < rep; j++) \ + n_fft_##fun##_variant(coeffs, len, depth, F); \ + prof_stop(); \ + } \ + \ + n_fft_ctx_clear(F); \ + FLINT_TEST_CLEAR(state); \ +} \ + +SAMPLE(dft, ) +//SAMPLE(n_fft_dft, _stride) + +void 
sample_sd_fft(void * arg, ulong count) +{ + info_t * info = (info_t *) arg; + const ulong p = info->prime; + const ulong depth = info->depth; + + const ulong len = UWORD(1) << depth; + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); + + sd_fft_ctx_t Q; + sd_fft_ctx_init_prime(Q, p); + sd_fft_ctx_fit_depth(Q, depth); + + ulong sz = sd_fft_ctx_data_size(depth)*sizeof(double); + + FLINT_TEST_INIT(state); + + nmod_t mod; + nmod_init(&mod, p); + ulong * coeffs = _nmod_vec_init(len); + _nmod_vec_randtest(coeffs, state, len, mod); + + double* data = flint_aligned_alloc(4096, n_round_up(sz, 4096)); + for (ulong i = 0; i < len; i++) + data[i] = coeffs[i]; + + for (ulong i = 0; i < count; i++) + { + prof_start(); + for (ulong j = 0; j < rep; j++) + sd_fft_trunc(Q, data, depth, len, len); + prof_stop(); + } + + sd_fft_ctx_clear(Q); + FLINT_TEST_CLEAR(state); +} + + +int main() +{ + flint_printf("- depth is log(fft length)\n"); + flint_printf("- timing DFT (length power of 2) for several bit lengths and depths\n"); + flint_printf("depth\tsd_fft\trec4\n"); + + // FIXME FLINT_BITS issue + ulong primes[num_primes] = { + 786433, // 20 bits, 1 + 2**18 * 3 + 2013265921, // 31 bits, 1 + 2**27 * 3 * 5 + 2748779069441, // 42 bits, 1 + 2**39 * 5 + 1108307720798209, // 50 bits, 1 + 2**44 * 3**2 * 7 + 1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23 + }; + ulong max_depths[num_primes] = { 18, 25, 25, 25, 25 }; + + for (ulong k = 3; k < 4; k++) + { + for (ulong depth = 3; depth <= max_depths[k]; depth++) + { + printf("%ld\t", depth); + + info_t info; + info.prime = primes[k]; + info.maxdepth = max_depths[k]; + info.depth = depth; + info.stride = 1; + + const ulong len = UWORD(1) << depth; + const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); + + double min[15]; + double max; + + prof_repeat(min+0, &max, sample_sd_fft, (void *) &info); + prof_repeat(min+1, &max, sample_dft, (void *) &info); + + flint_printf("%.1e\t%.1e\t\n", + min[0]/(double)1000000/rep, + 
min[1]/(double)1000000/rep + ); + } + } + return 0; +} + +/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s + From f4520c963bffec8765d8878a3eaf9c2e72f293a1 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 10:12:42 +0200 Subject: [PATCH 15/71] clean things a bit --- src/n_fft/dft.c | 122 ++++++++++++++++++++++++++++---------- src/n_fft/profile/p-dft.c | 1 - src/n_fft/test/t-dft.c | 4 +- 3 files changed, 92 insertions(+), 35 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 4cf8b5ed52..87e04f0b7b 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -9,7 +9,7 @@ (at your option) any later version. See . */ -#include "longlong.h" +#include "longlong_asm_gcc.h" // TODO change to longlong #include "n_fft.h" /*---------*/ @@ -191,7 +191,9 @@ FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ul DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); - DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); DFT4_LAZY44(*p4, *p5, *p6, *p7, F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], @@ -215,18 +217,18 @@ FLINT_FORCE_INLINE void dft8_lazy44(ulong * p0, ulong * p1, ulong * p2, ulong * ulong p_hi, p_lo, u, v; const ulong w = F->tab_w[2*node]; - const ulong wpre = F->tab_w[2*node+1]; - - DFT2_LAZY44(*p0, *p4, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); - DFT2_LAZY44(*p1, *p5, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); - DFT2_LAZY44(*p2, *p6, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); - DFT2_LAZY44(*p3, *p7, F->mod, F->mod2, w, wpre, p_hi, p_lo, u, v); + const ulong w_pr = F->tab_w[2*node+1]; + DFT2_LAZY44(*p0, *p4, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); + DFT2_LAZY44(*p1, *p5, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); + DFT2_LAZY44(*p2, 
*p6, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); + DFT2_LAZY44(*p3, *p7, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); DFT4_LAZY44(*p0, *p1, *p2, *p3, F->tab_w[4*node], F->tab_w[4*node+1], F->tab_w[8*node], F->tab_w[8*node+1], F->tab_w[8*node+2], F->tab_w[8*node+3], F->mod, F->mod2, p_hi, p_lo, u); + DFT4_LAZY44(*p4, *p5, *p6, *p7, F->tab_w[4*node+2], F->tab_w[4*node+3], F->tab_w[8*node+4], F->tab_w[8*node+5], @@ -245,42 +247,82 @@ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= F->mod2; - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - DFT4_LAZY44(p[4], p[5], p[6], p[7], F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], F->tab_w[6], F->tab_w[7], F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[8], p[9], p[10], p[11], F->tab_w[4], F->tab_w[5], F->tab_w[8], F->tab_w[9], F->tab_w[10], F->tab_w[11], F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[12], 
p[13], p[14], p[15], F->tab_w[6], F->tab_w[7], F->tab_w[12], F->tab_w[13], F->tab_w[14], F->tab_w[15], F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + DFT4_LAZY44(p[4], p[5], p[6], p[7], + F->tab_w[2], F->tab_w[3], + F->tab_w[4], F->tab_w[5], + F->tab_w[6], F->tab_w[7], + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[8], p[9], p[10], p[11], + F->tab_w[4], F->tab_w[5], + F->tab_w[8], F->tab_w[9], + F->tab_w[10], F->tab_w[11], + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[12], p[13], p[14], p[15], + F->tab_w[6], F->tab_w[7], + F->tab_w[12], F->tab_w[13], + F->tab_w[14], F->tab_w[15], + F->mod, F->mod2, p_hi, p_lo, tmp); } // TODO doc for dft16 -// TODO simplify and bench other variants +// TODO bench other variants FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - - ulong w2 = F->tab_w[2*node]; - ulong w2pre = F->tab_w[2*node+1]; - ulong w = F->tab_w[4*node]; - ulong wpre = F->tab_w[4*node+1]; - ulong Iw = F->tab_w[4*node+2]; - ulong Iwpre = F->tab_w[4*node+3]; - DFT4_LAZY44(p[0], p[4], p[8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[5], p[9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + ulong w2, w2pre, w, wpre, Iw, Iwpre; + + w2 = F->tab_w[2*node]; + w2pre = F->tab_w[2*node+1]; + w = F->tab_w[4*node]; + wpre = F->tab_w[4*node+1]; + Iw = F->tab_w[4*node+2]; + Iwpre = F->tab_w[4*node+3]; + + DFT4_LAZY44(p[0], p[4], p[8], p[12], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[5], p[9], p[13], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); + 
DFT4_LAZY44(p[2], p[6], p[10], p[14], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[7], p[11], p[15], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node]; w2pre = F->tab_w[8*node+1]; @@ -288,7 +330,11 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+1]; Iw = F->tab_w[16*node+2]; Iwpre = F->tab_w[16*node+3]; - DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[0], p[1], p[2], p[3], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+2]; w2pre = F->tab_w[8*node+3]; @@ -296,7 +342,11 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+5]; Iw = F->tab_w[16*node+6]; Iwpre = F->tab_w[16*node+7]; - DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[4], p[5], p[6], p[7], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+4]; w2pre = F->tab_w[8*node+5]; @@ -304,7 +354,11 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+9]; Iw = F->tab_w[16*node+10]; Iwpre = F->tab_w[16*node+11]; - DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[8], p[9], p[10], p[11], + w2, w2pre, + w, wpre, + Iw, Iwpre, + F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+6]; w2pre = F->tab_w[8*node+7]; @@ -312,7 +366,11 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+13]; Iw = F->tab_w[16*node+14]; Iwpre = F->tab_w[16*node+15]; - DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[12], p[13], p[14], p[15], + w2, w2pre, + w, wpre, + Iw, Iwpre, + 
F->mod, F->mod2, p_hi, p_lo, tmp); } /*--------------*/ diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index da6716ec34..1ec5540648 100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -91,7 +91,6 @@ void sample_sd_fft(void * arg, ulong count) FLINT_TEST_CLEAR(state); } - int main() { flint_printf("- depth is log(fft length)\n"); diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index a4c12ce2d0..4ca33dc933 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -19,7 +19,7 @@ #define MAX_EVAL_DEPTH 10 // vector equality up to reduction mod -static int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) +static inline int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) { for (ulong k = 0; k < len; k++) { @@ -35,7 +35,7 @@ static int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t } // testing that all elements of "vec" are less than "bound" -static int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) +static inline int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) { for (ulong k = 0; k < len; k++) if (vec[k] >= bound) From e10c29c37d8bba7378d1d45637dd9260e01ee06e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 10:16:21 +0200 Subject: [PATCH 16/71] introducing dft32 base case --- src/n_fft/dft.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 87e04f0b7b..207fcc348d 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -377,10 +377,90 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) /* 32-point DFT */ /*--------------*/ +// in [0..2n), out [0..4n), max value < 8n +FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) +{ + ulong p_hi, p_lo; + + DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[0] >= F->mod2) + 
p[0] -= F->mod2; + DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[1] >= F->mod2) + p[1] -= F->mod2; + DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[2] >= F->mod2) + p[2] -= F->mod2; + DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[3] >= F->mod2) + p[3] -= F->mod2; + DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[4] >= F->mod2) + p[4] -= F->mod2; + DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[5] >= F->mod2) + p[5] -= F->mod2; + DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[6] >= F->mod2) + p[6] -= F->mod2; + DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[7] >= F->mod2) + p[7] -= F->mod2; + + // next line requires < 2n, hence the 8 reductions above + dft8_red_lazy( p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); + dft8_red_lazy_general(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); + dft8_red_lazy_general(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); + dft8_red_lazy_general(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); +} +// in [0..2n), out [0..4n), max value < 8n +FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + ulong w2 = F->tab_w[2*node]; + ulong w2pre = F->tab_w[2*node+1]; + ulong w = F->tab_w[4*node]; + ulong wpre = F->tab_w[4*node+1]; + ulong Iw = F->tab_w[4*node+2]; + ulong Iwpre = F->tab_w[4*node+3]; + DFT4_LAZY(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + 
DFT4_LAZY(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + // next line requires < 2n, hence the four reductions above + dft8_red_lazy_general(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F); + dft8_red_lazy_general(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F); + dft8_red_lazy_general(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F); + dft8_red_lazy_general(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F); +} + +/*-------------*/ +/* general DFT */ +/*-------------*/ /** 2**depth-point DFT * * in [0..4n) / out [0..4n) / max < 8n From 7b605a67d6eb5e9326f0cde6dc3799c629b429d4 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 10:21:15 +0200 Subject: [PATCH 17/71] dft32 base case --- src/n_fft/dft.c | 114 +++++++++++++++--------------------------------- 1 file changed, 35 insertions(+), 79 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 207fcc348d..83f2f8aacc 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -191,9 +191,7 @@ FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ul DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); - DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); DFT4_LAZY44(*p4, *p5, *p6, *p7, 
F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], @@ -247,31 +245,21 @@ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= F->mod2; - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); DFT4_LAZY44(p[4], p[5], p[6], p[7], F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], @@ -303,26 +291,10 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) Iw = F->tab_w[4*node+2]; Iwpre = F->tab_w[4*node+3]; - DFT4_LAZY44(p[0], p[4], p[8], p[12], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[5], p[9], p[13], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[6], p[10], p[14], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); - 
DFT4_LAZY44(p[3], p[7], p[11], p[15], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[0], p[4], p[ 8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[5], p[ 9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node]; w2pre = F->tab_w[8*node+1]; @@ -330,11 +302,7 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+1]; Iw = F->tab_w[16*node+2]; Iwpre = F->tab_w[16*node+3]; - DFT4_LAZY44(p[0], p[1], p[2], p[3], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+2]; w2pre = F->tab_w[8*node+3]; @@ -342,11 +310,7 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+5]; Iw = F->tab_w[16*node+6]; Iwpre = F->tab_w[16*node+7]; - DFT4_LAZY44(p[4], p[5], p[6], p[7], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+4]; w2pre = F->tab_w[8*node+5]; @@ -354,11 +318,7 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+9]; Iw = F->tab_w[16*node+10]; Iwpre = F->tab_w[16*node+11]; - DFT4_LAZY44(p[8], p[9], p[10], p[11], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); w2 = F->tab_w[8*node+6]; w2pre = F->tab_w[8*node+7]; @@ -366,11 +326,7 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr 
p, ulong node, n_fft_ctx_t F) wpre = F->tab_w[16*node+13]; Iw = F->tab_w[16*node+14]; Iwpre = F->tab_w[16*node+15]; - DFT4_LAZY44(p[12], p[13], p[14], p[15], - w2, w2pre, - w, wpre, - Iw, Iwpre, - F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); } /*--------------*/ @@ -424,10 +380,10 @@ FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) p[7] -= F->mod2; // next line requires < 2n, hence the 8 reductions above - dft8_red_lazy( p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); - dft8_red_lazy_general(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); - dft8_red_lazy_general(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); - dft8_red_lazy_general(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); + dft8_node0_lazy24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); + dft8_lazy44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); + dft8_lazy44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); + dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); } @@ -442,20 +398,20 @@ FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) ulong wpre = F->tab_w[4*node+1]; ulong Iw = F->tab_w[4*node+2]; ulong Iwpre = F->tab_w[4*node+3]; - DFT4_LAZY(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, 
Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); // next line requires < 2n, hence the four reductions above - dft8_red_lazy_general(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F); - dft8_red_lazy_general(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F); - dft8_red_lazy_general(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F); - dft8_red_lazy_general(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F); + dft8_lazy44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F); + dft8_lazy44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F); + dft8_lazy44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F); + dft8_lazy44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F); } /*-------------*/ @@ -482,8 +438,8 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) dft8_lazy44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F); else if (depth == 4) dft16_lazy44(p, node, F); - //else if (depth == 5) // TODO unclear this helps (no 
acceleration on argiope) - // dft32_red_lazy_general(p, node, F); + else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) + dft32_lazy44(p, node, F); else { // in: [0..4n), out: [0..4n) @@ -540,8 +496,8 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) else if (depth == 4) dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) - //else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) - // dft32_red_lazy(p, F); // in [0..2n), out [0..4n) + else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) + dft32_node0_lazy24(p, F); // in [0..2n), out [0..4n) else { // input [0..2n) x [0..2n), output [0..2n) x [0..4n) @@ -580,7 +536,7 @@ void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) else if (depth == 4) dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) //else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) - // dft32_red_lazy(p, F); // in [0..2n), out [0..4n) + // dft32_node0_lazy24(p, F); // in [0..2n), out [0..4n) else { // input [0..2n) x [0..2n), output [0..2n) x [0..4n) From 1f236d8467b4a2ea0aadd7737ddadc71beb111ea Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 11:04:59 +0200 Subject: [PATCH 18/71] cleaning things --- src/n_fft/dft.c | 89 +++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 83f2f8aacc..81ffe9804b 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -238,9 +238,10 @@ FLINT_FORCE_INLINE void dft8_lazy44(ulong * p0, ulong * p1, ulong * p2, ulong * /* 16-point DFT */ /*--------------*/ -// TODO doc for dft16 -// TODO simplify and bench other variants -// in [0..2n), out [0..4n), max value < 8n +/** 16-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n + * * Same specification as dft_node0_lazy24, for depth==4 + 
*/ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; @@ -277,8 +278,10 @@ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) F->mod, F->mod2, p_hi, p_lo, tmp); } -// TODO doc for dft16 -// TODO bench other variants +/** 16-point DFT + * * in [0..4n) / out [0..4n) / max < 8n + * * Same specification as dft_lazy44, for depth==4 + */ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; @@ -333,7 +336,10 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) /* 32-point DFT */ /*--------------*/ -// in [0..2n), out [0..4n), max value < 8n +/** 32-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n + * * Same specification as dft_node0_lazy24, for depth==5 + */ FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo; @@ -386,8 +392,10 @@ FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); } - -// in [0..2n), out [0..4n), max value < 8n +/** 32-point DFT + * * in [0..4n) / out [0..4n) / max < 8n + * * Same specification as dft_lazy44, for depth==5 + */ FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; @@ -428,20 +436,21 @@ FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) * * By construction these evaluation points are the roots of the * polynomial x**len - F->tab_w[node] * * Requirement (not checked): + * 3 <= depth * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) */ // TODO remove argument len -// TODO remove restriction to length >= 3 ? 
void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) { if (depth == 3) dft8_lazy44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F); else if (depth == 4) dft16_lazy44(p, node, F); - else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) + else if (depth == 5) dft32_lazy44(p, node, F); else { + // 4-point butterflies // in: [0..4n), out: [0..4n) const nn_ptr p0 = p; const nn_ptr p1 = p+len/4; @@ -463,6 +472,7 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) DFT4_LAZY44(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); } + // 4 recursive calls with depth-2 dft_lazy44(p0, len/4, depth-2, 4*node, F); dft_lazy44(p1, len/4, depth-2, 4*node+1, F); dft_lazy44(p2, len/4, depth-2, 4*node+2, F); @@ -480,18 +490,10 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) * * By construction these evaluation points are the roots of the polynomial * x**len - 1, precisely they are all powers of the chosen len-th primitive * root of unity with exponents listed in bit reversed order - * * Requirement (not checked): depth <= F.depth + * * Requirements (not checked): 3 <= depth <= F.depth */ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) { - // depth == 0: nothing to do - //if (depth == 1) - // // in [0..4n), out [0..4n) - // DFT2_LAZY4_RED(p[0], p[1], F->mod4); - //else if (depth == 2) - // // in [0..2n), out [0..4n) - // DFT4_LAZY2_RED(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); - //else if (depth == 3) dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) else if (depth == 4) @@ -500,8 +502,9 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) dft32_node0_lazy24(p, F); // in [0..2n), out [0..4n) else { - // input [0..2n) x [0..2n), output [0..2n) x [0..4n) - // (general accepts [0..4n) as input for depth >= 3) + // 4-point 
butterflies + // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) + // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) const nn_ptr p0 = p; const nn_ptr p1 = p + len/4; const nn_ptr p2 = p + 2*len/4; @@ -510,9 +513,11 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) for (ulong k = 0; k < len/4; k++) { DFT4_NODE0_LAZY24(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[k] >= F->mod2) - p[k] -= F->mod2; + if (p0[k] >= F->mod2) + p0[k] -= F->mod2; } + + // 4 recursive calls with depth-2 dft_node0_lazy24(p0, len/4, depth-2, F); dft_lazy44(p1, len/4, depth-2, 1, F); dft_lazy44(p2, len/4, depth-2, 2, F); @@ -524,23 +529,27 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) { // depth == 0: nothing to do - //if (depth == 1) - // // in [0..4n), out [0..4n) - // DFT2_LAZY4_RED(p[0], p[1], F->mod4); - //else if (depth == 2) - // // in [0..2n), out [0..4n) - // DFT4_LAZY2_RED(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2); - //else - if (depth == 3) - dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) + if (depth == 1) + { + ulong tmp; + DFT2_NODE0_LAZY24(p[0], p[1], F->mod2, tmp); + } + else if (depth == 2) + { + ulong p_hi, p_lo; + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + } + else if (depth == 3) + dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); else if (depth == 4) - dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) - //else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) - // dft32_node0_lazy24(p, F); // in [0..2n), out [0..4n) + dft16_node0_lazy24(p, F); + else if (depth == 5) + dft32_node0_lazy24(p, F); else { - // input [0..2n) x [0..2n), output [0..2n) x [0..4n) - // (general accepts [0..4n) as input for depth >= 3) + // 4-point butterflies 
+ // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) + // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) const nn_ptr p0 = p; const nn_ptr p1 = p + len/4; const nn_ptr p2 = p + 2*len/4; @@ -549,9 +558,11 @@ void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) for (ulong k = 0; k < len/4; k++) { DFT4_NODE0_LAZY24(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[k] >= F->mod2) - p[k] -= F->mod2; + if (p0[k] >= F->mod2) + p0[k] -= F->mod2; } + + // 4 recursive calls with depth-2 dft_node0_lazy24(p0, len/4, depth-2, F); dft_lazy44(p1, len/4, depth-2, 1, F); dft_lazy44(p2, len/4, depth-2, 2, F); From 9bf18c7fdd6377dae064154f92576624c57487c2 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 11:14:08 +0200 Subject: [PATCH 19/71] testing from length 1 --- src/n_fft/dft.c | 4 +++- src/n_fft/test/t-dft.c | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 81ffe9804b..9eb5b9637d 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -528,7 +528,9 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) // TODO try lazier variant for entry point < n, to see if any gain void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) { - // depth == 0: nothing to do + if (depth > 0) + return; + if (depth == 1) { ulong tmp; diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 4ca33dc933..756a389fb1 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -65,9 +65,9 @@ TEST_FUNCTION_START(n_fft_dft, state) n_fft_ctx_t F; n_fft_ctx_init2(F, MAX_EVAL_DEPTH, p); - for (ulong depth = 3; depth <= MAX_EVAL_DEPTH; depth++) + for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++) { - const ulong len = (1UL<tab_w[2*k]; - evals_br[2*k] = nmod_poly_evaluate_nmod(pol, point); - evals_br[2*k+1] = nmod_poly_evaluate_nmod(pol, nmod_neg(point, mod)); - } + if (len == 1) + evals_br[0] 
= nmod_poly_evaluate_nmod(pol, UWORD(1)); + else + for (ulong k = 0; k < len/2; k++) + { + ulong point = F->tab_w[2*k]; + evals_br[2*k] = nmod_poly_evaluate_nmod(pol, point); + evals_br[2*k+1] = nmod_poly_evaluate_nmod(pol, nmod_neg(point, mod)); + } // evals by DFT ulong * p = _nmod_vec_init(len); From fb88c54f47de482152bc2ecd255678271d71abcc Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 11:20:47 +0200 Subject: [PATCH 20/71] fix --- src/n_fft.h | 12 +++++++----- src/n_fft/dft.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 5299f4d557..7e741d1cfe 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -20,6 +20,12 @@ extern "C" { #endif +/** + * TODO[short term] confirm the limit on the modulus + * TODO[short term] add testing for general variants, not only node0 + * TODO[longer term] large depth can lead to heavy memory usage + * --> provide precomputation-free functions + */ /** n_fft context: * parameters and tabulated powers of the primitive root of unity "w". 
@@ -41,10 +47,6 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; * - mod is an odd prime < 2**61 * - max_depth must be >= 3 (so, 8 must divide mod - 1) * Total memory cost of precomputations: <= 128 + 2**(depth+1) ulong's - * - * TODO[short term] confirm the limit on the modulus - * TODO[longer term] large depth can lead to heavy memory usage - * --> provide precomputation-free functions **/ /** tab_w2: @@ -108,7 +110,7 @@ void n_fft_ctx_clear(n_fft_ctx_t F); /** dft: - * transforms, inverse transforms, transposed transforms + * transforms / inverse transforms / transposed transforms * at length a power of 2 */ void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F); diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 9eb5b9637d..9fcef8b6a0 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -528,7 +528,7 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) // TODO try lazier variant for entry point < n, to see if any gain void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) { - if (depth > 0) + if (depth == 0) return; if (depth == 1) From f6cc96ca140f17638a60723f486cfb559fa28f44 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 11:25:30 +0200 Subject: [PATCH 21/71] remove useless function argument --- src/n_fft.h | 2 +- src/n_fft/dft.c | 45 ++++++++++++++++++++++----------------- src/n_fft/profile/p-dft.c | 2 +- src/n_fft/test/t-dft.c | 2 +- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 7e741d1cfe..79ab60bb9c 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -113,7 +113,7 @@ void n_fft_ctx_clear(n_fft_ctx_t F); * transforms / inverse transforms / transposed transforms * at length a power of 2 */ -void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F); +void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 9fcef8b6a0..d29f707607 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -439,8 +439,7 @@ 
FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) * 3 <= depth * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) */ -// TODO remove argument len -void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) +void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) { if (depth == 3) dft8_lazy44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F); @@ -450,6 +449,8 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) dft32_lazy44(p, node, F); else { + const ulong len = UWORD(1) << depth; + // 4-point butterflies // in: [0..4n), out: [0..4n) const nn_ptr p0 = p; @@ -473,10 +474,10 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) } // 4 recursive calls with depth-2 - dft_lazy44(p0, len/4, depth-2, 4*node, F); - dft_lazy44(p1, len/4, depth-2, 4*node+1, F); - dft_lazy44(p2, len/4, depth-2, 4*node+2, F); - dft_lazy44(p3, len/4, depth-2, 4*node+3, F); + dft_lazy44(p0, depth-2, 4*node, F); + dft_lazy44(p1, depth-2, 4*node+1, F); + dft_lazy44(p2, depth-2, 4*node+2, F); + dft_lazy44(p3, depth-2, 4*node+3, F); } } @@ -492,16 +493,18 @@ void dft_lazy44(nn_ptr p, ulong len, ulong depth, ulong node, n_fft_ctx_t F) * root of unity with exponents listed in bit reversed order * * Requirements (not checked): 3 <= depth <= F.depth */ -void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) +void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) { if (depth == 3) - dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); // in [0..2n), out [0..4n) + dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); else if (depth == 4) - dft16_node0_lazy24(p, F); // in [0..2n), out [0..4n) - else if (depth == 5) // TODO unclear this helps (no acceleration on argiope) - dft32_node0_lazy24(p, F); // in [0..2n), out [0..4n) + dft16_node0_lazy24(p, F); + else if (depth == 5) + dft32_node0_lazy24(p, F); else { + const ulong len = UWORD(1) << depth; + // 4-point 
butterflies // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) @@ -518,15 +521,15 @@ void dft_node0_lazy24(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) } // 4 recursive calls with depth-2 - dft_node0_lazy24(p0, len/4, depth-2, F); - dft_lazy44(p1, len/4, depth-2, 1, F); - dft_lazy44(p2, len/4, depth-2, 2, F); - dft_lazy44(p3, len/4, depth-2, 3, F); + dft_node0_lazy24(p0, depth-2, F); + dft_lazy44(p1, depth-2, 1, F); + dft_lazy44(p2, depth-2, 2, F); + dft_lazy44(p3, depth-2, 3, F); } } // TODO try lazier variant for entry point < n, to see if any gain -void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) +void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) { if (depth == 0) return; @@ -549,6 +552,8 @@ void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) dft32_node0_lazy24(p, F); else { + const ulong len = UWORD(1) << depth; + // 4-point butterflies // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) @@ -565,10 +570,10 @@ void n_fft_dft(nn_ptr p, ulong len, ulong depth, n_fft_ctx_t F) } // 4 recursive calls with depth-2 - dft_node0_lazy24(p0, len/4, depth-2, F); - dft_lazy44(p1, len/4, depth-2, 1, F); - dft_lazy44(p2, len/4, depth-2, 2, F); - dft_lazy44(p3, len/4, depth-2, 3, F); + dft_node0_lazy24(p0, depth-2, F); + dft_lazy44(p1, depth-2, 1, F); + dft_lazy44(p2, depth-2, 2, F); + dft_lazy44(p3, depth-2, 3, F); } } diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index 1ec5540648..2bf21cceeb 100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -42,7 +42,7 @@ void sample_##fun##_variant(void * arg, ulong count) { \ prof_start(); \ for (ulong j = 0; j < rep; j++) \ - n_fft_##fun##_variant(coeffs, len, depth, F); \ + n_fft_##fun##_variant(coeffs, depth, F); \ prof_stop(); \ } \ \ diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 
756a389fb1..7273d55a05 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -90,7 +90,7 @@ TEST_FUNCTION_START(n_fft_dft, state) ulong * p = _nmod_vec_init(len); _nmod_vec_set(p, pol->coeffs, len); - n_fft_dft(p, len, depth, F); + n_fft_dft(p, depth, F); int res = nmod_vec_red_equal(evals_br, p, len, mod); From a675b6808e5cf28cc69490f8d2dd9bd530d10f23 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 11:50:40 +0200 Subject: [PATCH 22/71] vaguely faster with added lazy14 layer --- src/n_fft/dft.c | 182 ++++++++++++++++++++++++++++++++++++-- src/n_fft/profile/p-dft.c | 4 - 2 files changed, 176 insertions(+), 10 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index d29f707607..4b77e41bce 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -48,6 +48,19 @@ (a) -= (n2); \ } while(0) +/** TODO Cooley-Tukey butterfly, node 0 + * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + */ +#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n) - tmp; \ + (a) = (a) + tmp; \ + } while(0) + /** Cooley-Tukey butterfly, general * * in [0..4n) / out [0..4n) / max < 4n @@ -113,6 +126,40 @@ (d) = v5 + (n2) - v7; /* < 4*n */ \ } while(0) +/** TODO 4-point DFT, node 0 + * * in [0..n) / out [0..4n) / max < 4n + * * In-place transform + * [1 1 1 1] + * [1 -1 I -I] + * [a b c d] <- [a b c d] [1 1 -1 -1] + * [1 -1 -I I] + * * Corresponds to reducing down the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + * where I is typically a square root of -1 + * (but this property is not exploited) + */ +#define DFT4_NODE0_LAZY14(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ + do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 2*n */ \ + ulong v5 = v0 + (n) - v2; /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 2*n */ \ + ulong v7; \ + 
N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ + p_hi, p_lo); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 3*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ + } while(0) + /** 4-point DFT, general * * in [0..4n) / out [0..4n) / max < 8n * * In-place transform @@ -199,6 +246,33 @@ FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ul F->mod, F->mod2, p_hi, p_lo, tmp); } +/** TODO 8-point DFT, node 0 + * * in [0..n) / out [0..4n) / max < 8n + * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial + * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations + * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) + * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order. + * * Recall [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ] + */ +FLINT_FORCE_INLINE void dft8_node0_lazy14(ulong * p0, ulong * p1, ulong * p2, ulong * p3, + ulong * p4, ulong * p5, ulong * p6, ulong * p7, + n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + + DFT2_NODE0_LAZY12(*p0, *p4, F->mod, tmp); + DFT2_NODE0_LAZY12(*p1, *p5, F->mod, tmp); + DFT2_NODE0_LAZY12(*p2, *p6, F->mod, tmp); + DFT2_NODE0_LAZY12(*p3, *p7, F->mod, tmp); + + DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_LAZY44(*p4, *p5, *p6, *p7, + F->tab_w[2], F->tab_w[3], + F->tab_w[4], F->tab_w[5], + F->tab_w[6], F->tab_w[7], + F->mod, F->mod2, p_hi, p_lo, tmp); +} + /** 8-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial @@ -278,6 +352,46 @@ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) F->mod, F->mod2, p_hi, p_lo, tmp); } +/** TODO 16-point DFT, node 0 + * * in [0..n) / out [0..4n) / max < 8n + * * Same specification as dft_node0_lazy24, for depth==4 + */ +FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +{ + ulong p_hi, p_lo, tmp; + + 
DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[0] >= F->mod2) + p[0] -= F->mod2; + DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[1] >= F->mod2) + p[1] -= F->mod2; + DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[2] >= F->mod2) + p[2] -= F->mod2; + DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p[3] >= F->mod2) + p[3] -= F->mod2; + + // next line requires < 2n, hence the four reductions above + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_LAZY44(p[4], p[5], p[6], p[7], + F->tab_w[2], F->tab_w[3], + F->tab_w[4], F->tab_w[5], + F->tab_w[6], F->tab_w[7], + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[8], p[9], p[10], p[11], + F->tab_w[4], F->tab_w[5], + F->tab_w[8], F->tab_w[9], + F->tab_w[10], F->tab_w[11], + F->mod, F->mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[12], p[13], p[14], p[15], + F->tab_w[6], F->tab_w[7], + F->tab_w[12], F->tab_w[13], + F->tab_w[14], F->tab_w[15], + F->mod, F->mod2, p_hi, p_lo, tmp); +} + /** 16-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==4 @@ -392,6 +506,62 @@ FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); } +/** TODO 32-point DFT, node 0 + * * in [0..n) / out [0..4n) / max < 8n + * * Same specification as dft_node0_lazy24, for depth==5 + */ +FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +{ + ulong p_hi, p_lo; + + DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[0] >= F->mod2) + p[0] -= F->mod2; + DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[1] >= 
F->mod2) + p[1] -= F->mod2; + DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[2] >= F->mod2) + p[2] -= F->mod2; + DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[3] >= F->mod2) + p[3] -= F->mod2; + DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[4] >= F->mod2) + p[4] -= F->mod2; + DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[5] >= F->mod2) + p[5] -= F->mod2; + DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[6] >= F->mod2) + p[6] -= F->mod2; + DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], + F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + if (p[7] >= F->mod2) + p[7] -= F->mod2; + + // next line requires < 2n, hence the 8 reductions above + dft8_node0_lazy24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); + dft8_lazy44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); + dft8_lazy44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); + dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); +} + /** 32-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==5 @@ -537,19 +707,19 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth == 1) { ulong tmp; - DFT2_NODE0_LAZY24(p[0], p[1], F->mod2, tmp); + DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); } else if (depth == 2) { ulong p_hi, p_lo; - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } else if (depth == 3) - dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); + dft8_node0_lazy14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); else if (depth == 4) - 
dft16_node0_lazy24(p, F); + dft16_node0_lazy14(p, F); else if (depth == 5) - dft32_node0_lazy24(p, F); + dft32_node0_lazy14(p, F); else { const ulong len = UWORD(1) << depth; @@ -564,7 +734,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) ulong p_hi, p_lo; for (ulong k = 0; k < len/4; k++) { - DFT4_NODE0_LAZY24(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p0[k] >= F->mod2) p0[k] -= F->mod2; } diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index 2bf21cceeb..b9a2e89c9d 100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -136,7 +136,3 @@ int main() } return 0; } - -/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ -// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s - From 28b32761a9defaac8bd507d31d6d423acc98460c Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 12:01:14 +0200 Subject: [PATCH 23/71] clean explanations --- src/n_fft/dft.c | 121 +++++++++++++++++--------------------- src/n_fft/profile/p-dft.c | 2 +- 2 files changed, 55 insertions(+), 68 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 4b77e41bce..899ee43b39 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -32,6 +32,20 @@ /* 2-point DFT */ /*-------------*/ +/** Cooley-Tukey butterfly, node 0 + * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n is the modulus, tmp is a temporary + */ +#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n) - tmp; \ + (a) = (a) + tmp; \ + } while(0) + /** Cooley-Tukey butterfly, node 0 * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n * * In-place transform @@ -48,20 +62,6 @@ (a) -= (n2); \ } while(0) -/** TODO Cooley-Tukey butterfly, node 0 - * * in [0..n) x [0..n) / out [0..2n) x 
[0..2n) / max < 2n - * * In-place transform - * [1 1] - * [a b] <- [a b] [1 -1] - */ -#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ - do { \ - tmp = (b); \ - (b) = (a) + (n) - tmp; \ - (a) = (a) + tmp; \ - } while(0) - - /** Cooley-Tukey butterfly, general * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform @@ -87,7 +87,7 @@ /*-------------*/ /** 4-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 4n + * * in [0..n) / out [0..4n) / max < 4n * * In-place transform * [1 1 1 1] * [1 -1 I -I] @@ -101,62 +101,51 @@ * x - 1 x + 1 x - I x + I * where I is typically a square root of -1 * (but this property is not exploited) + * * n is the modulus and n2 == 2*n, p_hi, p_lo are temporaries */ -#define DFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +#define DFT4_NODE0_LAZY14(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ do { \ const ulong v0 = (a); \ const ulong v1 = (b); \ const ulong v2 = (c); \ const ulong v3 = (d); \ - ulong v4 = v0 + v2; /* < 4*n */ \ - if (v4 >= (n2)) \ - v4 -= (n2); /* < 2*n */ \ - ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ - if (v5 >= (n2)) \ - v5 -= (n2); /* < 2*n */ \ - ulong v6 = v1 + v3; /* < 4*n */ \ - if (v6 >= (n2)) \ - v6 -= (n2); /* < 2*n */ \ + ulong v4 = v0 + v2; /* < 2*n */ \ + ulong v5 = v0 + (n) - v2; /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 2*n */ \ ulong v7; \ - N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n) - v3, (I_pr), (n), \ p_hi, p_lo); \ (a) = v4 + v6; /* < 4*n */ \ (b) = v4 + (n2) - v6; /* < 4*n */ \ - (c) = v5 + v7; /* < 4*n */ \ + (c) = v5 + v7; /* < 3*n */ \ (d) = v5 + (n2) - v7; /* < 4*n */ \ } while(0) -/** TODO 4-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 4n - * * In-place transform - * [1 1 1 1] - * [1 -1 I -I] - * [a b c d] <- [a b c d] [1 1 -1 -1] - * [1 -1 -I I] - * * Corresponds to reducing down the tree with nodes - * x^4 - 1 - * / \ - * x^2 - 1 x^2 + 1 - * / \ / \ - * x - 1 x + 1 x - I x + I - * where I is typically a square 
root of -1 - * (but this property is not exploited) +/** 4-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 4n + * * other than this, same specification as DFT4_NODE0_LAZY14 */ -#define DFT4_NODE0_LAZY14(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +#define DFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ do { \ const ulong v0 = (a); \ const ulong v1 = (b); \ const ulong v2 = (c); \ const ulong v3 = (d); \ - ulong v4 = v0 + v2; /* < 2*n */ \ - ulong v5 = v0 + (n) - v2; /* < 2*n */ \ - ulong v6 = v1 + v3; /* < 2*n */ \ + ulong v4 = v0 + v2; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ ulong v7; \ N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ p_hi, p_lo); \ (a) = v4 + v6; /* < 4*n */ \ (b) = v4 + (n2) - v6; /* < 4*n */ \ - (c) = v5 + v7; /* < 3*n */ \ + (c) = v5 + v7; /* < 4*n */ \ (d) = v5 + (n2) - v7; /* < 4*n */ \ } while(0) @@ -173,7 +162,7 @@ * x^2 - w1 x^2 + w1 * / \ / \ * x - w2 x + w2 x - w3 x + w3 - * typically w2**2 == w1 and w3 == I*w2 (so that w3**2 == -w1) so that this + * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that this * really is the subproduct tree built from the four roots * w2, -w2, I*w2, -I*w2 of x**4 - w1 */ @@ -220,25 +209,27 @@ do { \ /*-------------*/ /** 8-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n + * * in [0..n) / out [0..4n) / max < 8n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) - * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order. + * i.e. 
the evaluations at all 8-th roots of unity J**k for 0 <= k < 8 in + * bit-reversed order * * Recall [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ] */ -FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ulong * p3, +FLINT_FORCE_INLINE void dft8_node0_lazy14(ulong * p0, ulong * p1, ulong * p2, ulong * p3, ulong * p4, ulong * p5, ulong * p6, ulong * p7, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT2_NODE0_LAZY24(*p0, *p4, F->mod2, tmp); - DFT2_NODE0_LAZY24(*p1, *p5, F->mod2, tmp); - DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); - DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); + DFT2_NODE0_LAZY12(*p0, *p4, F->mod, tmp); + DFT2_NODE0_LAZY12(*p1, *p5, F->mod, tmp); + DFT2_NODE0_LAZY12(*p2, *p6, F->mod, tmp); + DFT2_NODE0_LAZY12(*p3, *p7, F->mod, tmp); DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + // could use a lazy24 variant of the next macro, but the gain is negligible DFT4_LAZY44(*p4, *p5, *p6, *p7, F->tab_w[2], F->tab_w[3], F->tab_w[4], F->tab_w[5], @@ -246,24 +237,20 @@ FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ul F->mod, F->mod2, p_hi, p_lo, tmp); } -/** TODO 8-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n - * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial - * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations - * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) - * at all 8-th roots of unity J**k for 0 <= k < 8 in bit-reversed order. 
- * * Recall [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ] +/** 8-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n + * * apart from these ranges, same specification as dft8_node0_lazy14 */ -FLINT_FORCE_INLINE void dft8_node0_lazy14(ulong * p0, ulong * p1, ulong * p2, ulong * p3, +FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ulong * p3, ulong * p4, ulong * p5, ulong * p6, ulong * p7, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT2_NODE0_LAZY12(*p0, *p4, F->mod, tmp); - DFT2_NODE0_LAZY12(*p1, *p5, F->mod, tmp); - DFT2_NODE0_LAZY12(*p2, *p6, F->mod, tmp); - DFT2_NODE0_LAZY12(*p3, *p7, F->mod, tmp); + DFT2_NODE0_LAZY24(*p0, *p4, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p1, *p5, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); + DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); DFT4_LAZY44(*p4, *p5, *p6, *p7, diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index b9a2e89c9d..733efa957d 100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -29,7 +29,7 @@ void sample_##fun##_variant(void * arg, ulong count) nmod_t mod; \ nmod_init(&mod, p); \ ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), (p - 1) >> maxdepth, mod); \ - ulong w = nmod_pow_ui(w0, 1UL<<(maxdepth - depth), mod); \ + ulong w = nmod_pow_ui(w0, UWORD(1)<<(maxdepth - depth), mod); \ n_fft_ctx_t F; \ n_fft_ctx_init2_root(F, w, depth, depth, p); \ \ From b71649dd540027ffce391cd11734bd1da8109440 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 12:08:34 +0200 Subject: [PATCH 24/71] finalize lazy14 version --- src/n_fft/dft.c | 93 ++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 899ee43b39..eee77d5d9f 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -300,23 +300,23 @@ FLINT_FORCE_INLINE void dft8_lazy44(ulong * 
p0, ulong * p1, ulong * p2, ulong * /*--------------*/ /** 16-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n - * * Same specification as dft_node0_lazy24, for depth==4 + * * in [0..n) / out [0..4n) / max < 8n + * * Apart from this range, same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= F->mod2; - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; @@ -339,24 +339,24 @@ FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) F->mod, F->mod2, p_hi, p_lo, tmp); } -/** TODO 16-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n +/** 16-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], 
F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= F->mod2; - DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; @@ -438,49 +438,49 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) /*--------------*/ /** 32-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n - * * Same specification as dft_node0_lazy24, for depth==5 + * * in [0..n) / out [0..4n) / max < 8n + * * Apart from this range, same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo; - DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], + DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], + DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], + DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= F->mod2; - 
DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], + DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; - DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], + DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[4] >= F->mod2) p[4] -= F->mod2; - DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], + DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[5] >= F->mod2) p[5] -= F->mod2; - DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], + DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[6] >= F->mod2) p[6] -= F->mod2; - DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], + DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[7] >= F->mod2) @@ -493,50 +493,50 @@ FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); } -/** TODO 32-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n +/** 32-point DFT, node 0 + * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) { ulong p_hi, p_lo; - DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], + DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[0] >= F->mod2) p[0] -= F->mod2; - DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], + DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[1] >= F->mod2) p[1] -= F->mod2; - DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], + DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[2] >= F->mod2) p[2] -= 
F->mod2; - DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], + DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[3] >= F->mod2) p[3] -= F->mod2; - DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], + DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[4] >= F->mod2) p[4] -= F->mod2; - DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], + DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[5] >= F->mod2) p[5] -= F->mod2; - DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], + DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[6] >= F->mod2) p[6] -= F->mod2; - DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], + DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); if (p[7] >= F->mod2) @@ -685,7 +685,18 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) } } -// TODO try lazier variant for entry point < n, to see if any gain +/** 2**depth-point DFT + * * in [0..n) / out [0..4n) / max < 8n + * * In-place transform p of length len == 2**depth into + * the concatenation of + * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] + * for k in range(len), + * where w_k = F->tab_w[2*k] for 0 <= k < 2**(depth-1) + * * By construction these evaluation points are the roots of the polynomial + * x**len - 1, precisely they are all powers of the chosen len-th primitive + * root of unity with exponents listed in bit reversed order + * * Requirements (not checked): depth <= F.depth + */ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) { if (depth == 0) @@ -712,7 +723,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) const ulong len = UWORD(1) << depth; // 4-point butterflies - // input p0,p1,p2,p3 in [0..2n) x [0..2n) x [0..2n) x [0..2n) + // input p0,p1,p2,p3 in [0..n) x [0..n) x [0..n) x 
[0..n) // output p0,p1,p2,p3 in [0..2n) x [0..4n) x [0..4n) x [0..4n) const nn_ptr p0 = p; const nn_ptr p1 = p + len/4; @@ -733,9 +744,3 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) dft_lazy44(p3, depth-2, 3, F); } } - - - - - - From 8cd392ce208b0a85b2934bfda04b48247eb00a97 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 12:52:26 +0200 Subject: [PATCH 25/71] small fixes --- src/n_fft/dft.c | 2 +- src/n_fft/test/t-dft.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index eee77d5d9f..ee76f4f017 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -9,7 +9,7 @@ (at your option) any later version. See . */ -#include "longlong_asm_gcc.h" // TODO change to longlong +#include "longlong.h" #include "n_fft.h" /*---------*/ diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 7273d55a05..7176f116f0 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -53,17 +53,17 @@ TEST_FUNCTION_START(n_fft_dft, state) { // take some FFT prime p with max_depth >= 12 ulong max_depth = 12 + n_randint(state, 10); - ulong p = 1 + (UWORD(1) << max_depth); - while (! n_is_prime(p)) - p += (UWORD(1) << max_depth); - max_depth = flint_ctz(p-1); + ulong prime = 1 + (UWORD(1) << max_depth); + while (! 
n_is_prime(prime)) + prime += (UWORD(1) << max_depth); + max_depth = flint_ctz(prime-1); nmod_t mod; - nmod_init(&mod, p); + nmod_init(&mod, prime); // init FFT root tables n_fft_ctx_t F; - n_fft_ctx_init2(F, MAX_EVAL_DEPTH, p); + n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime); for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++) { From 9fa90204ace3c31f44b5020b9021642dc69fc79a Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 13:17:27 +0200 Subject: [PATCH 26/71] tentative fix for flint_bits == 32 --- src/n_fft/profile/p-init.c | 1 + src/n_fft/test/t-init.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c index 75378be6d1..c79e349f7e 100644 --- a/src/n_fft/profile/p-init.c +++ b/src/n_fft/profile/p-init.c @@ -107,6 +107,7 @@ int main() printf("depth\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\n"); + // TODO fix for FLINT_BITS==32 ulong primes[num_primes] = { 786433, // 20 bits, 1 + 2**18 * 3 2013265921, // 31 bits, 1 + 2**27 * 3 * 5 diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index b6164207a5..ae956156f5 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -92,8 +92,12 @@ TEST_FUNCTION_START(n_fft_ctx_init2, state) ulong p, max_depth; if (i % 20 != 0) { - // take random prime in [17, 2**61) + // take random prime in [17, 2**(FLINT_BITS-3)) +#if FLINT_BITS == 64 ulong bits = 5 + n_randint(state, 57); +#else + ulong bits = 5 + n_randint(state, 24); +#endif p = n_randprime(state, bits, 1); max_depth = flint_ctz(p-1); @@ -108,7 +112,11 @@ TEST_FUNCTION_START(n_fft_ctx_init2, state) { // the above will most often have max_depth 3 or 4 // every now and then we want p with larger max_depth +#if FLINT_BITS == 64 + max_depth = 40 + n_randint(state, 10); +#else max_depth = 10 + n_randint(state, 10); +#endif p = 1 + (UWORD(1) << max_depth); while (! 
n_is_prime(p)) p += (UWORD(1) << max_depth); From ccd3f71cf4cc53f1c5957130e43c2e7901c9286d Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 15:10:12 +0200 Subject: [PATCH 27/71] dft8 is now a macro, code generation was too unpredictable --- src/n_fft.h | 3 + src/n_fft/dft.c | 162 +++++++++++++++++++++++++----------------------- 2 files changed, 86 insertions(+), 79 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 79ab60bb9c..02ab93e77f 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -114,6 +114,9 @@ void n_fft_ctx_clear(n_fft_ctx_t F); * at length a power of 2 */ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); +void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); +void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index ee76f4f017..cf2f25cb83 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -9,9 +9,11 @@ (at your option) any later version. See . */ -#include "longlong.h" +#include "longlong_asm_gcc.h" #include "n_fft.h" +// TODO provide function which reduces output to [0..n) ? 
+ /*---------*/ /* helpers */ /*---------*/ @@ -217,48 +219,51 @@ do { \ * bit-reversed order * * Recall [F->tab_w[2*k] for k in range(4)] == [1, I, J, IJ] */ -FLINT_FORCE_INLINE void dft8_node0_lazy14(ulong * p0, ulong * p1, ulong * p2, ulong * p3, - ulong * p4, ulong * p5, ulong * p6, ulong * p7, - n_fft_ctx_t F) -{ - ulong p_hi, p_lo, tmp; - - DFT2_NODE0_LAZY12(*p0, *p4, F->mod, tmp); - DFT2_NODE0_LAZY12(*p1, *p5, F->mod, tmp); - DFT2_NODE0_LAZY12(*p2, *p6, F->mod, tmp); - DFT2_NODE0_LAZY12(*p3, *p7, F->mod, tmp); - - DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - // could use a lazy24 variant of the next macro, but the gain is negligible - DFT4_LAZY44(*p4, *p5, *p6, *p7, - F->tab_w[2], F->tab_w[3], - F->tab_w[4], F->tab_w[5], - F->tab_w[6], F->tab_w[7], - F->mod, F->mod2, p_hi, p_lo, tmp); -} +#define DFT8_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT2_NODE0_LAZY12(*(p0), *(p4), mod, tmp); \ + DFT2_NODE0_LAZY12(*(p1), *(p5), mod, tmp); \ + DFT2_NODE0_LAZY12(*(p2), *(p6), mod, tmp); \ + DFT2_NODE0_LAZY12(*(p3), *(p7), mod, tmp); \ + \ + DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + /* could use a lazy24 variant of the next macro, */ \ + /* but the gain is negligible */ \ + DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ +} while(0) /** 8-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 8n - * * apart from these ranges, same specification as dft8_node0_lazy14 + * * apart from these ranges, same specification as DFT8_NODE0_LAZY14 */ -FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ulong * p3, - ulong * p4, ulong * p5, ulong * p6, ulong * p7, - n_fft_ctx_t F) -{ - ulong p_hi, p_lo, tmp; - - DFT2_NODE0_LAZY24(*p0, *p4, F->mod2, tmp); - DFT2_NODE0_LAZY24(*p1, *p5, 
F->mod2, tmp); - DFT2_NODE0_LAZY24(*p2, *p6, F->mod2, tmp); - DFT2_NODE0_LAZY24(*p3, *p7, F->mod2, tmp); - - DFT4_NODE0_LAZY24(*p0, *p1, *p2, *p3, F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - DFT4_LAZY44(*p4, *p5, *p6, *p7, - F->tab_w[2], F->tab_w[3], - F->tab_w[4], F->tab_w[5], - F->tab_w[6], F->tab_w[7], - F->mod, F->mod2, p_hi, p_lo, tmp); -} +#define DFT8_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT2_NODE0_LAZY24(*(p0), *(p4), mod2, tmp); \ + DFT2_NODE0_LAZY24(*(p1), *(p5), mod2, tmp); \ + DFT2_NODE0_LAZY24(*(p2), *(p6), mod2, tmp); \ + DFT2_NODE0_LAZY24(*(p3), *(p7), mod2, tmp); \ + \ + DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ +} while(0) /** 8-point DFT * * in [0..4n) / out [0..4n) / max < 8n @@ -269,31 +274,30 @@ FLINT_FORCE_INLINE void dft8_node0_lazy24(ulong * p0, ulong * p1, ulong * p2, ul * * By construction these 8 evaluation points are the 8 roots of the * polynomial x**8 - F->tab_w[node] */ -FLINT_FORCE_INLINE void dft8_lazy44(ulong * p0, ulong * p1, ulong * p2, ulong * p3, - ulong * p4, ulong * p5, ulong * p6, ulong * p7, - ulong node, n_fft_ctx_t F) -{ - ulong p_hi, p_lo, u, v; - - const ulong w = F->tab_w[2*node]; - const ulong w_pr = F->tab_w[2*node+1]; - DFT2_LAZY44(*p0, *p4, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); - DFT2_LAZY44(*p1, *p5, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); - DFT2_LAZY44(*p2, *p6, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); - DFT2_LAZY44(*p3, *p7, F->mod, F->mod2, w, w_pr, p_hi, p_lo, u, v); - - DFT4_LAZY44(*p0, *p1, *p2, *p3, - F->tab_w[4*node], F->tab_w[4*node+1], - F->tab_w[8*node], F->tab_w[8*node+1], - F->tab_w[8*node+2], F->tab_w[8*node+3], - F->mod, F->mod2, p_hi, p_lo, u); - - DFT4_LAZY44(*p4, *p5, *p6, *p7, - 
F->tab_w[4*node+2], F->tab_w[4*node+3], - F->tab_w[8*node+4], F->tab_w[8*node+5], - F->tab_w[8*node+6], F->tab_w[8*node+7], - F->mod, F->mod2, p_hi, p_lo, u); -} +#define DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ + node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, u, v; \ + \ + const ulong w = tab_w[2*(node)]; \ + const ulong w_pr = tab_w[2*(node)+1]; \ + DFT2_LAZY44(*(p0), *(p4), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(*(p1), *(p5), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(*(p2), *(p6), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(*(p3), *(p7), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + \ + DFT4_LAZY44(*(p0), *(p1), *(p2), *(p3), \ + tab_w[4*(node)], tab_w[4*(node)+1], \ + tab_w[8*(node)], tab_w[8*(node)+1], \ + tab_w[8*(node)+2], tab_w[8*(node)+3], \ + mod, mod2, p_hi, p_lo, u); \ + \ + DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + tab_w[4*(node)+2], tab_w[4*(node)+3], \ + tab_w[8*(node)+4], tab_w[8*(node)+5], \ + tab_w[8*(node)+6], tab_w[8*(node)+7], \ + mod, mod2, p_hi, p_lo, u); \ +} while(0) /*--------------*/ /* 16-point DFT */ @@ -487,10 +491,10 @@ FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, n_fft_ctx_t F) p[7] -= F->mod2; // next line requires < 2n, hence the 8 reductions above - dft8_node0_lazy24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); - dft8_lazy44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); - dft8_lazy44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); - dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F->mod, F->mod2, F->tab_w); } /** 32-point DFT, node 0 @@ -543,10 +547,10 @@ FLINT_FORCE_INLINE void 
dft32_node0_lazy24(nn_ptr p, n_fft_ctx_t F) p[7] -= F->mod2; // next line requires < 2n, hence the 8 reductions above - dft8_node0_lazy24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F); - dft8_lazy44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F); - dft8_lazy44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F); - dft8_lazy44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F); + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F->mod, F->mod2, F->tab_w); } /** 32-point DFT @@ -573,10 +577,10 @@ FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); // next line requires < 2n, hence the four reductions above - dft8_lazy44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F); - dft8_lazy44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F); - dft8_lazy44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F); - dft8_lazy44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F); + DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F->mod, F->mod2, F->tab_w); } /*-------------*/ @@ -599,7 +603,7 @@ FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) { if (depth == 3) - dft8_lazy44(p+0, p+1, p+2, p+3, p+4, p+5, 
p+6, p+7, node, F); + DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); else if (depth == 4) dft16_lazy44(p, node, F); else if (depth == 5) @@ -653,7 +657,7 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) { if (depth == 3) - dft8_node0_lazy24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); + DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) dft16_node0_lazy24(p, F); else if (depth == 5) @@ -713,7 +717,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } else if (depth == 3) - dft8_node0_lazy14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F); + DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) dft16_node0_lazy14(p, F); else if (depth == 5) From f0587e51a3329ef98de07699c0ac406efd6699df Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 15:31:59 +0200 Subject: [PATCH 28/71] putting more args slightly slows down for large lengths... 
--- src/n_fft/dft.c | 390 ++++++++++++++++++++++++------------------------ 1 file changed, 195 insertions(+), 195 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index cf2f25cb83..cc95fa553c 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -307,134 +307,134 @@ do { \ * * in [0..n) / out [0..4n) / max < 8n * * Apart from this range, same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[0] >= F->mod2) - p[0] -= F->mod2; - DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[1] >= F->mod2) - p[1] -= F->mod2; - DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[2] >= F->mod2) - p[2] -= F->mod2; - DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[3] >= F->mod2) - p[3] -= F->mod2; + DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[0] >= mod2) + p[0] -= mod2; + DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[1] >= mod2) + p[1] -= mod2; + DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[2] >= mod2) + p[2] -= mod2; + DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[3] >= mod2) + p[3] -= mod2; // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); DFT4_LAZY44(p[4], p[5], p[6], p[7], - F->tab_w[2], F->tab_w[3], - F->tab_w[4], 
F->tab_w[5], - F->tab_w[6], F->tab_w[7], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[2], tab_w[3], + tab_w[4], tab_w[5], + tab_w[6], tab_w[7], + mod, mod2, p_hi, p_lo, tmp); DFT4_LAZY44(p[8], p[9], p[10], p[11], - F->tab_w[4], F->tab_w[5], - F->tab_w[8], F->tab_w[9], - F->tab_w[10], F->tab_w[11], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[4], tab_w[5], + tab_w[8], tab_w[9], + tab_w[10], tab_w[11], + mod, mod2, p_hi, p_lo, tmp); DFT4_LAZY44(p[12], p[13], p[14], p[15], - F->tab_w[6], F->tab_w[7], - F->tab_w[12], F->tab_w[13], - F->tab_w[14], F->tab_w[15], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[6], tab_w[7], + tab_w[12], tab_w[13], + tab_w[14], tab_w[15], + mod, mod2, p_hi, p_lo, tmp); } /** 16-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo, tmp; - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[0] >= F->mod2) - p[0] -= F->mod2; - DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[1] >= F->mod2) - p[1] -= F->mod2; - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[2] >= F->mod2) - p[2] -= F->mod2; - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p[3] >= F->mod2) - p[3] -= F->mod2; + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[0] >= mod2) + p[0] -= mod2; + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[1] >= mod2) + p[1] -= mod2; + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[2] >= mod2) + p[2] -= mod2; + DFT4_NODE0_LAZY24(p[3], p[7], 
p[11], p[15], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); + if (p[3] >= mod2) + p[3] -= mod2; // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); DFT4_LAZY44(p[4], p[5], p[6], p[7], - F->tab_w[2], F->tab_w[3], - F->tab_w[4], F->tab_w[5], - F->tab_w[6], F->tab_w[7], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[2], tab_w[3], + tab_w[4], tab_w[5], + tab_w[6], tab_w[7], + mod, mod2, p_hi, p_lo, tmp); DFT4_LAZY44(p[8], p[9], p[10], p[11], - F->tab_w[4], F->tab_w[5], - F->tab_w[8], F->tab_w[9], - F->tab_w[10], F->tab_w[11], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[4], tab_w[5], + tab_w[8], tab_w[9], + tab_w[10], tab_w[11], + mod, mod2, p_hi, p_lo, tmp); DFT4_LAZY44(p[12], p[13], p[14], p[15], - F->tab_w[6], F->tab_w[7], - F->tab_w[12], F->tab_w[13], - F->tab_w[14], F->tab_w[15], - F->mod, F->mod2, p_hi, p_lo, tmp); + tab_w[6], tab_w[7], + tab_w[12], tab_w[13], + tab_w[14], tab_w[15], + mod, mod2, p_hi, p_lo, tmp); } /** 16-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==4 */ -FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo, tmp; ulong w2, w2pre, w, wpre, Iw, Iwpre; - w2 = F->tab_w[2*node]; - w2pre = F->tab_w[2*node+1]; - w = F->tab_w[4*node]; - wpre = F->tab_w[4*node+1]; - Iw = F->tab_w[4*node+2]; - Iwpre = F->tab_w[4*node+3]; - - DFT4_LAZY44(p[0], p[4], p[ 8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[5], p[ 9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, 
w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - - w2 = F->tab_w[8*node]; - w2pre = F->tab_w[8*node+1]; - w = F->tab_w[16*node]; - wpre = F->tab_w[16*node+1]; - Iw = F->tab_w[16*node+2]; - Iwpre = F->tab_w[16*node+3]; - DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - - w2 = F->tab_w[8*node+2]; - w2pre = F->tab_w[8*node+3]; - w = F->tab_w[16*node+4]; - wpre = F->tab_w[16*node+5]; - Iw = F->tab_w[16*node+6]; - Iwpre = F->tab_w[16*node+7]; - DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - - w2 = F->tab_w[8*node+4]; - w2pre = F->tab_w[8*node+5]; - w = F->tab_w[16*node+8]; - wpre = F->tab_w[16*node+9]; - Iw = F->tab_w[16*node+10]; - Iwpre = F->tab_w[16*node+11]; - DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - - w2 = F->tab_w[8*node+6]; - w2pre = F->tab_w[8*node+7]; - w = F->tab_w[16*node+12]; - wpre = F->tab_w[16*node+13]; - Iw = F->tab_w[16*node+14]; - Iwpre = F->tab_w[16*node+15]; - DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + w2 = tab_w[2*node]; + w2pre = tab_w[2*node+1]; + w = tab_w[4*node]; + wpre = tab_w[4*node+1]; + Iw = tab_w[4*node+2]; + Iwpre = tab_w[4*node+3]; + + DFT4_LAZY44(p[0], p[4], p[ 8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[5], p[ 9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + + w2 = tab_w[8*node]; + w2pre = tab_w[8*node+1]; + w = tab_w[16*node]; + wpre = tab_w[16*node+1]; + Iw = tab_w[16*node+2]; + Iwpre = tab_w[16*node+3]; + DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + + w2 = 
tab_w[8*node+2]; + w2pre = tab_w[8*node+3]; + w = tab_w[16*node+4]; + wpre = tab_w[16*node+5]; + Iw = tab_w[16*node+6]; + Iwpre = tab_w[16*node+7]; + DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + + w2 = tab_w[8*node+4]; + w2pre = tab_w[8*node+5]; + w = tab_w[16*node+8]; + wpre = tab_w[16*node+9]; + Iw = tab_w[16*node+10]; + Iwpre = tab_w[16*node+11]; + DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + + w2 = tab_w[8*node+6]; + w2pre = tab_w[8*node+7]; + w = tab_w[16*node+12]; + wpre = tab_w[16*node+13]; + Iw = tab_w[16*node+14]; + Iwpre = tab_w[16*node+15]; + DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); } /*--------------*/ @@ -445,142 +445,142 @@ FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) * * in [0..n) / out [0..4n) / max < 8n * * Apart from this range, same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo; DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[0] >= F->mod2) - p[0] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[0] >= mod2) + p[0] -= mod2; DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[1] >= F->mod2) - p[1] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[1] >= mod2) + p[1] -= mod2; DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[2] >= F->mod2) - p[2] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[2] >= mod2) + p[2] -= mod2; DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[3] 
>= F->mod2) - p[3] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[3] >= mod2) + p[3] -= mod2; DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[4] >= F->mod2) - p[4] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[4] >= mod2) + p[4] -= mod2; DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[5] >= F->mod2) - p[5] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[5] >= mod2) + p[5] -= mod2; DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[6] >= F->mod2) - p[6] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[6] >= mod2) + p[6] -= mod2; DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[7] >= F->mod2) - p[7] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[7] >= mod2) + p[7] -= mod2; // next line requires < 2n, hence the 8 reductions above - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F->mod, F->mod2, F->tab_w); + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); } /** 32-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, 
n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo; DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[0] >= F->mod2) - p[0] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[0] >= mod2) + p[0] -= mod2; DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[1] >= F->mod2) - p[1] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[1] >= mod2) + p[1] -= mod2; DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[2] >= F->mod2) - p[2] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[2] >= mod2) + p[2] -= mod2; DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[3] >= F->mod2) - p[3] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[3] >= mod2) + p[3] -= mod2; DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[4] >= F->mod2) - p[4] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[4] >= mod2) + p[4] -= mod2; DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[5] >= F->mod2) - p[5] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[5] >= mod2) + p[5] -= mod2; DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[6] >= F->mod2) - p[6] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[6] >= mod2) + p[6] -= mod2; DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], - F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - if (p[7] >= F->mod2) - p[7] -= F->mod2; + tab_w[2], tab_w[3], + mod, mod2, p_hi, p_lo); + if (p[7] >= mod2) + p[7] -= mod2; // next line requires < 
2n, hence the 8 reductions above - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, F->mod, F->mod2, F->tab_w); + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); } /** 32-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==5 */ -FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, n_fft_ctx_t F) +FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, ulong mod, ulong mod2, nn_ptr tab_w) { ulong p_hi, p_lo, tmp; - ulong w2 = F->tab_w[2*node]; - ulong w2pre = F->tab_w[2*node+1]; - ulong w = F->tab_w[4*node]; - ulong wpre = F->tab_w[4*node+1]; - ulong Iw = F->tab_w[4*node+2]; - ulong Iwpre = F->tab_w[4*node+3]; - DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, 
p_hi, p_lo, tmp); - DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, F->mod, F->mod2, p_hi, p_lo, tmp); + ulong w2 = tab_w[2*node]; + ulong w2pre = tab_w[2*node+1]; + ulong w = tab_w[4*node]; + ulong wpre = tab_w[4*node+1]; + ulong Iw = tab_w[4*node+2]; + ulong Iwpre = tab_w[4*node+3]; + DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); + DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); // next line requires < 2n, hence the four reductions above - DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, F->mod, F->mod2, F->tab_w); - DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, mod, mod2, tab_w); + DFT8_LAZY44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, mod, mod2, tab_w); + DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, mod, mod2, tab_w); + DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, mod, mod2, tab_w); } /*-------------*/ @@ -605,9 +605,9 @@ void dft_lazy44(nn_ptr 
p, ulong depth, ulong node, n_fft_ctx_t F) if (depth == 3) DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_lazy44(p, node, F); + dft16_lazy44(p, node, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_lazy44(p, node, F); + dft32_lazy44(p, node, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -659,9 +659,9 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth == 3) DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_node0_lazy24(p, F); + dft16_node0_lazy24(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_node0_lazy24(p, F); + dft32_node0_lazy24(p, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -719,9 +719,9 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 3) DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_node0_lazy14(p, F); + dft16_node0_lazy14(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_node0_lazy14(p, F); + dft32_node0_lazy14(p, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; From 4cf73439f0aa4645b4548cea948d399254160aa4 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 15:44:19 +0200 Subject: [PATCH 29/71] macro for dft16 helps, let's see for dft32 --- src/n_fft/dft.c | 282 +++++++++++++++++++++++++++--------------------- 1 file changed, 160 insertions(+), 122 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index cc95fa553c..5401fde755 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -307,135 +307,173 @@ do { \ * * in [0..n) / out [0..4n) / max < 8n * * Apart from this range, same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy14(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo, tmp; - - DFT4_NODE0_LAZY14(p[0], 
p[4], p[ 8], p[12], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[0] >= mod2) - p[0] -= mod2; - DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[1] >= mod2) - p[1] -= mod2; - DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[2] >= mod2) - p[2] -= mod2; - DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[3] >= mod2) - p[3] -= mod2; - - // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - DFT4_LAZY44(p[4], p[5], p[6], p[7], - tab_w[2], tab_w[3], - tab_w[4], tab_w[5], - tab_w[6], tab_w[7], - mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[8], p[9], p[10], p[11], - tab_w[4], tab_w[5], - tab_w[8], tab_w[9], - tab_w[10], tab_w[11], - mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[12], p[13], p[14], p[15], - tab_w[6], tab_w[7], - tab_w[12], tab_w[13], - tab_w[14], tab_w[15], - mod, mod2, p_hi, p_lo, tmp); -} +#define DFT16_NODE0_LAZY14(p, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[0] >= mod2) \ + p[0] -= mod2; \ + DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[1] >= mod2) \ + p[1] -= mod2; \ + DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[2] >= mod2) \ + p[2] -= mod2; \ + DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[3] >= mod2) \ + p[3] -= mod2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + DFT4_LAZY44(p[4], p[5], p[6], p[7], \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, 
mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[8], p[9], p[10], p[11], \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[12], p[13], p[14], p[15], \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + mod, mod2, p_hi, p_lo, tmp); \ +} while(0) /** 16-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==4 */ -FLINT_FORCE_INLINE void dft16_node0_lazy24(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo, tmp; - - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[0] >= mod2) - p[0] -= mod2; - DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[1] >= mod2) - p[1] -= mod2; - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[2] >= mod2) - p[2] -= mod2; - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - if (p[3] >= mod2) - p[3] -= mod2; - - // next line requires < 2n, hence the four reductions above - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], tab_w[2], tab_w[3], mod, mod2, p_hi, p_lo); - DFT4_LAZY44(p[4], p[5], p[6], p[7], - tab_w[2], tab_w[3], - tab_w[4], tab_w[5], - tab_w[6], tab_w[7], - mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[8], p[9], p[10], p[11], - tab_w[4], tab_w[5], - tab_w[8], tab_w[9], - tab_w[10], tab_w[11], - mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[12], p[13], p[14], p[15], - tab_w[6], tab_w[7], - tab_w[12], tab_w[13], - tab_w[14], tab_w[15], - mod, mod2, p_hi, p_lo, tmp); -} +#define DFT16_NODE0_LAZY24(p, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[0] >= mod2) \ + p[0] -= mod2; \ + DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if 
(p[1] >= mod2) \ + p[1] -= mod2; \ + DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[2] >= mod2) \ + p[2] -= mod2; \ + DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[3] >= mod2) \ + p[3] -= mod2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + DFT4_LAZY44(p[4], p[5], p[6], p[7], \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[8], p[9], p[10], p[11], \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[12], p[13], p[14], p[15], \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + mod, mod2, p_hi, p_lo, tmp); \ +} while(0) /** 16-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==4 */ -FLINT_FORCE_INLINE void dft16_lazy44(nn_ptr p, ulong node, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo, tmp; - ulong w2, w2pre, w, wpre, Iw, Iwpre; - - w2 = tab_w[2*node]; - w2pre = tab_w[2*node+1]; - w = tab_w[4*node]; - wpre = tab_w[4*node+1]; - Iw = tab_w[4*node+2]; - Iwpre = tab_w[4*node+3]; - - DFT4_LAZY44(p[0], p[4], p[ 8], p[12], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[5], p[ 9], p[13], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[6], p[10], p[14], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[3], p[7], p[11], p[15], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - - w2 = tab_w[8*node]; - w2pre = tab_w[8*node+1]; - w = tab_w[16*node]; - wpre = tab_w[16*node+1]; - Iw = tab_w[16*node+2]; - Iwpre = tab_w[16*node+3]; - DFT4_LAZY44(p[0], p[1], p[2], p[3], w2, w2pre, w, wpre, Iw, 
Iwpre, mod, mod2, p_hi, p_lo, tmp); - - w2 = tab_w[8*node+2]; - w2pre = tab_w[8*node+3]; - w = tab_w[16*node+4]; - wpre = tab_w[16*node+5]; - Iw = tab_w[16*node+6]; - Iwpre = tab_w[16*node+7]; - DFT4_LAZY44(p[4], p[5], p[6], p[7], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - - w2 = tab_w[8*node+4]; - w2pre = tab_w[8*node+5]; - w = tab_w[16*node+8]; - wpre = tab_w[16*node+9]; - Iw = tab_w[16*node+10]; - Iwpre = tab_w[16*node+11]; - DFT4_LAZY44(p[8], p[9], p[10], p[11], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - - w2 = tab_w[8*node+6]; - w2pre = tab_w[8*node+7]; - w = tab_w[16*node+12]; - wpre = tab_w[16*node+13]; - Iw = tab_w[16*node+14]; - Iwpre = tab_w[16*node+15]; - DFT4_LAZY44(p[12], p[13], p[14], p[15], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); -} +#define DFT16_LAZY44(p, node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + ulong w2, w2pre, w, wpre, Iw, Iwpre; \ + \ + w2 = tab_w[2*node]; \ + w2pre = tab_w[2*node+1]; \ + w = tab_w[4*node]; \ + wpre = tab_w[4*node+1]; \ + Iw = tab_w[4*node+2]; \ + Iwpre = tab_w[4*node+3]; \ + \ + DFT4_LAZY44(p[0], p[4], p[ 8], p[12], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[1], p[5], p[ 9], p[13], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[2], p[6], p[10], p[14], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[3], p[7], p[11], p[15], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node]; \ + w2pre = tab_w[8*node+1]; \ + w = tab_w[16*node]; \ + wpre = tab_w[16*node+1]; \ + Iw = tab_w[16*node+2]; \ + Iwpre = tab_w[16*node+3]; \ + DFT4_LAZY44(p[0], p[1], p[2], p[3], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+2]; \ + w2pre = tab_w[8*node+3]; \ + w = tab_w[16*node+4]; \ + wpre = tab_w[16*node+5]; \ + Iw = tab_w[16*node+6]; \ + Iwpre = tab_w[16*node+7]; \ + 
DFT4_LAZY44(p[4], p[5], p[6], p[7], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+4]; \ + w2pre = tab_w[8*node+5]; \ + w = tab_w[16*node+8]; \ + wpre = tab_w[16*node+9]; \ + Iw = tab_w[16*node+10]; \ + Iwpre = tab_w[16*node+11]; \ + DFT4_LAZY44(p[8], p[9], p[10], p[11], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+6]; \ + w2pre = tab_w[8*node+7]; \ + w = tab_w[16*node+12]; \ + wpre = tab_w[16*node+13]; \ + Iw = tab_w[16*node+14]; \ + Iwpre = tab_w[16*node+15]; \ + DFT4_LAZY44(p[12], p[13], p[14], p[15], \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ +} while(0) /*--------------*/ /* 32-point DFT */ @@ -605,7 +643,7 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) if (depth == 3) DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_lazy44(p, node, F->mod, F->mod2, F->tab_w); + DFT16_LAZY44(p, node, F->mod, F->mod2, F->tab_w); else if (depth == 5) dft32_lazy44(p, node, F->mod, F->mod2, F->tab_w); else @@ -659,7 +697,7 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth == 3) DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_node0_lazy24(p, F->mod, F->mod2, F->tab_w); + DFT16_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) dft32_node0_lazy24(p, F->mod, F->mod2, F->tab_w); else @@ -719,7 +757,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 3) DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) - dft16_node0_lazy14(p, F->mod, F->mod2, F->tab_w); + DFT16_NODE0_LAZY14(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) dft32_node0_lazy14(p, F->mod, F->mod2, F->tab_w); else From b96e0a5dd66119a17ec50032789f22c09add011e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 
15:51:30 +0200 Subject: [PATCH 30/71] dft32 macroified does help a bit --- src/n_fft/dft.c | 260 ++++++++++++++++++++++++------------------------ 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 5401fde755..95280b4467 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -483,143 +483,143 @@ do { \ * * in [0..n) / out [0..4n) / max < 8n * * Apart from this range, same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy14(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo; - - DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[0] >= mod2) - p[0] -= mod2; - DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[1] >= mod2) - p[1] -= mod2; - DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[2] >= mod2) - p[2] -= mod2; - DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[3] >= mod2) - p[3] -= mod2; - DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[4] >= mod2) - p[4] -= mod2; - DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[5] >= mod2) - p[5] -= mod2; - DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[6] >= mod2) - p[6] -= mod2; - DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[7] >= mod2) - p[7] -= mod2; - - // next line requires < 2n, hence the 8 reductions above - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); - DFT8_LAZY44( p+24, p+25, p+26, p+27, 
p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); -} +#define DFT32_NODE0_LAZY14(p, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo; \ + \ + DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[0] >= mod2) \ + p[0] -= mod2; \ + DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[1] >= mod2) \ + p[1] -= mod2; \ + DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[2] >= mod2) \ + p[2] -= mod2; \ + DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[3] >= mod2) \ + p[3] -= mod2; \ + DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[4] >= mod2) \ + p[4] -= mod2; \ + DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[5] >= mod2) \ + p[5] -= mod2; \ + DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[6] >= mod2) \ + p[6] -= mod2; \ + DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[7] >= mod2) \ + p[7] -= mod2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); \ + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); \ + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); \ + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); \ +} while(0) /** 32-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 8n * * Same specification as dft_node0_lazy24, for depth==5 */ -FLINT_FORCE_INLINE void dft32_node0_lazy24(nn_ptr p, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo; - - DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], - tab_w[2], 
tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[0] >= mod2) - p[0] -= mod2; - DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[1] >= mod2) - p[1] -= mod2; - DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[2] >= mod2) - p[2] -= mod2; - DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[3] >= mod2) - p[3] -= mod2; - DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[4] >= mod2) - p[4] -= mod2; - DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[5] >= mod2) - p[5] -= mod2; - DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[6] >= mod2) - p[6] -= mod2; - DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], - tab_w[2], tab_w[3], - mod, mod2, p_hi, p_lo); - if (p[7] >= mod2) - p[7] -= mod2; - - // next line requires < 2n, hence the 8 reductions above - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); - DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); -} +#define DFT32_NODE0_LAZY24(p, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo; \ + \ + DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[0] >= mod2) \ + p[0] -= mod2; \ + DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[1] >= mod2) \ + p[1] -= mod2; \ + DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[2] >= mod2) \ + p[2] -= mod2; \ + DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, 
p_lo); \ + if (p[3] >= mod2) \ + p[3] -= mod2; \ + DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[4] >= mod2) \ + p[4] -= mod2; \ + DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[5] >= mod2) \ + p[5] -= mod2; \ + DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[6] >= mod2) \ + p[6] -= mod2; \ + DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p[7] >= mod2) \ + p[7] -= mod2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); \ + DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); \ + DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); \ + DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); \ +} while(0) /** 32-point DFT * * in [0..4n) / out [0..4n) / max < 8n * * Same specification as dft_lazy44, for depth==5 */ -FLINT_FORCE_INLINE void dft32_lazy44(nn_ptr p, ulong node, ulong mod, ulong mod2, nn_ptr tab_w) -{ - ulong p_hi, p_lo, tmp; - - ulong w2 = tab_w[2*node]; - ulong w2pre = tab_w[2*node+1]; - ulong w = tab_w[4*node]; - ulong wpre = tab_w[4*node+1]; - ulong Iw = tab_w[4*node+2]; - ulong Iwpre = tab_w[4*node+3]; - DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[5], p[13], 
p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); - - // next line requires < 2n, hence the four reductions above - DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, mod, mod2, tab_w); - DFT8_LAZY44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, mod, mod2, tab_w); - DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, mod, mod2, tab_w); - DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, mod, mod2, tab_w); -} +#define DFT32_LAZY44(p, node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + ulong w2 = tab_w[2*node]; \ + ulong w2pre = tab_w[2*node+1]; \ + ulong w = tab_w[4*node]; \ + ulong wpre = tab_w[4*node+1]; \ + ulong Iw = tab_w[4*node+2]; \ + ulong Iwpre = tab_w[4*node+3]; \ + DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + \ + /* next line requires < 2n, hence the four reductions above */ \ + DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, mod, mod2, tab_w); \ + DFT8_LAZY44(p+ 8, p+ 
9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, mod, mod2, tab_w); \ + DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, mod, mod2, tab_w); \ + DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, mod, mod2, tab_w); \ +} while(0) /*-------------*/ /* general DFT */ @@ -645,7 +645,7 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) else if (depth == 4) DFT16_LAZY44(p, node, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_lazy44(p, node, F->mod, F->mod2, F->tab_w); + DFT32_LAZY44(p, node, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -699,7 +699,7 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 4) DFT16_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_node0_lazy24(p, F->mod, F->mod2, F->tab_w); + DFT32_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -759,7 +759,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 4) DFT16_NODE0_LAZY14(p, F->mod, F->mod2, F->tab_w); else if (depth == 5) - dft32_node0_lazy14(p, F->mod, F->mod2, F->tab_w); + DFT32_NODE0_LAZY14(p, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; From 82a6c85a7e275f6b55dac20e65da7dd9d1d2e902 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Thu, 24 Oct 2024 23:45:33 +0200 Subject: [PATCH 31/71] mod4 currently unused --- src/n_fft.h | 2 +- src/n_fft/ctx_init.c | 2 +- src/n_fft/test/t-init.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 02ab93e77f..9f0dfccb43 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -35,7 +35,7 @@ typedef struct { ulong mod; // modulus, odd prime ulong mod2; // 2*mod (storing helps for speed) - ulong mod4; // 4*mod (storing helps for speed) + //ulong mod4; // 4*mod (storing helps for speed) ulong max_depth; // maximum supported depth (w has order 2**max_depth) ulong depth; // depth supported 
by current precomputation ulong * tab_w; // tabulated powers of w, see below diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 7a944cc72e..1136c29085 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -22,7 +22,7 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, // fill basic attributes F->mod = p; F->mod2 = 2*p; - F->mod4 = 4*p; + //F->mod4 = 4*p; F->max_depth = max_depth; F->depth = 3; // to be able to call fit_depth below diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index ae956156f5..2012dc69c1 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -41,8 +41,8 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t if (F->mod2 != 2*p) return 2; - if (F->mod4 != 4*p) - return 3; + //if (F->mod4 != 4*p) + // return 3; if (F->max_depth != max_depth) return 4; From 0cae5276169fe9e975f272d532c63a8ea2b8cb5a Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Fri, 25 Oct 2024 09:25:31 +0200 Subject: [PATCH 32/71] some notes / todos --- src/n_fft.h | 11 +++++++---- src/n_fft/dft.c | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 9f0dfccb43..292ee9a3db 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -21,10 +21,13 @@ extern "C" { #endif /** - * TODO[short term] confirm the limit on the modulus + * TODO[short term] augment precomputations with inverse roots + * TODO[short term] confirm the limit on the modulus (seems ok in dft; check via tests) * TODO[short term] add testing for general variants, not only node0 * TODO[longer term] large depth can lead to heavy memory usage * --> provide precomputation-free functions + * TODO[later] provide forward function which reduces output to [0..n) ? + * unclear this is useful... 
to be decided later */ /** n_fft context: @@ -114,9 +117,9 @@ void n_fft_ctx_clear(n_fft_ctx_t F); * at length a power of 2 */ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); -void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); -void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); -void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); +void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots) +void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (dft on inverted roots) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 95280b4467..9c5316c2a1 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -12,7 +12,8 @@ #include "longlong_asm_gcc.h" #include "n_fft.h" -// TODO provide function which reduces output to [0..n) ? +// TODO[later] provide function which reduces output to [0..n) ? +// unclear this is useful... to be decided later /*---------*/ /* helpers */ From 4818198f9b6ff13e318ba96bd1fc786547cbdf94 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Fri, 25 Oct 2024 23:01:54 +0200 Subject: [PATCH 33/71] test with prime close to announced limitw --- src/n_fft/dft.c | 3 --- src/n_fft/test/t-dft.c | 24 ++++++++++++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 9c5316c2a1..f34ab9967a 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -12,9 +12,6 @@ #include "longlong_asm_gcc.h" #include "n_fft.h" -// TODO[later] provide function which reduces output to [0..n) ? -// unclear this is useful... 
to be decided later - /*---------*/ /* helpers */ /*---------*/ diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 7176f116f0..8dd506a2a2 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -52,10 +52,26 @@ TEST_FUNCTION_START(n_fft_dft, state) for (i = 0; i < 100 * flint_test_multiplier(); i++) { // take some FFT prime p with max_depth >= 12 - ulong max_depth = 12 + n_randint(state, 10); - ulong prime = 1 + (UWORD(1) << max_depth); - while (! n_is_prime(prime)) - prime += (UWORD(1) << max_depth); + ulong max_depth, prime; + + // occasionally test large prime + // 61 bits: prime = 2305840260434624513 + // == 1 + 2**39 * 29 * 61 * 2371, log_2 == 60.999998 + // 29 bits: prime = 536608769 + // == 1 + 2**18 * 23 * 89, log_2 == 28.999295 + if (i % 10 == 0) +#if FLINT_BITS == 64 + prime = UWORD(2305840260434624513); +#else // FLINT_BITS == 32 + prime = UWORD(536608769); +#endif + else + { + max_depth = 12 + n_randint(state, 10); + prime = 1 + (UWORD(1) << max_depth); + while (! 
n_is_prime(prime)) + prime += (UWORD(1) << max_depth); + } max_depth = flint_ctz(prime-1); nmod_t mod; From c413b63f4fd25396b56cdfe4d72bf2fc548b53d2 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sat, 26 Oct 2024 00:10:57 +0200 Subject: [PATCH 34/71] limit is 62 bits --- src/n_fft.h | 1 - src/n_fft/dft.c | 36 ++++++++++++++++++------------------ src/n_fft/test/t-dft.c | 20 +++++++++----------- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 292ee9a3db..8bb640964d 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -22,7 +22,6 @@ extern "C" { /** * TODO[short term] augment precomputations with inverse roots - * TODO[short term] confirm the limit on the modulus (seems ok in dft; check via tests) * TODO[short term] add testing for general variants, not only node0 * TODO[longer term] large depth can lead to heavy memory usage * --> provide precomputation-free functions diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index f34ab9967a..2b34db2a66 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -9,7 +9,7 @@ (at your option) any later version. See . 
*/ -#include "longlong_asm_gcc.h" +#include "longlong.h" #include "n_fft.h" /*---------*/ @@ -150,7 +150,7 @@ } while(0) /** 4-point DFT, general - * * in [0..4n) / out [0..4n) / max < 8n + * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform * [ 1 1 1 1] * [w2 -w2 w3 -w3] @@ -190,13 +190,13 @@ do { \ \ N_MULMOD_PRECOMP_LAZY(u3, w1, u3, w1_pr, n, p_hi, p_lo); \ tmp = u1; \ - u1 = u1 + u3; /* [0..8n) */ \ - u3 = tmp + n2 - u3; /* [0..8n) */ \ + u1 = u1 + u3; /* [0..4n) */ \ + u3 = tmp + n2 - u3; /* [0..4n) */ \ \ N_MULMOD_PRECOMP_LAZY(u1, w2, u1, w2_pr, n, p_hi, p_lo); \ tmp = u0; \ - (a) = u0 + u1; /* [0..4n) */ \ - (b) = tmp + n2 - u1; /* [0..4n) */ \ + (a) = u0 + u1; /* [0..4n) */ \ + (b) = tmp + n2 - u1; /* [0..4n) */ \ \ N_MULMOD_PRECOMP_LAZY(u3, w3, u3, w3_pr, n, p_hi, p_lo); \ tmp = u2; \ @@ -209,7 +209,7 @@ do { \ /*-------------*/ /** 8-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n + * * in [0..n) / out [0..4n) / max < 4n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) @@ -240,7 +240,7 @@ do { \ } while(0) /** 8-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n + * * in [0..2n) / out [0..4n) / max < 4n * * apart from these ranges, same specification as DFT8_NODE0_LAZY14 */ #define DFT8_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, \ @@ -264,7 +264,7 @@ do { \ } while(0) /** 8-point DFT - * * in [0..4n) / out [0..4n) / max < 8n + * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial * p(x) = p0 + p1*x + ... 
+ p7*x**7 into its evaluations * p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3) @@ -302,7 +302,7 @@ do { \ /*--------------*/ /** 16-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n + * * in [0..n) / out [0..4n) / max < 4n * * Apart from this range, same specification as dft_node0_lazy24, for depth==4 */ #define DFT16_NODE0_LAZY14(p, mod, mod2, tab_w) \ @@ -353,7 +353,7 @@ do { \ } while(0) /** 16-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n + * * in [0..2n) / out [0..4n) / max < 4n * * Same specification as dft_node0_lazy24, for depth==4 */ #define DFT16_NODE0_LAZY24(p, mod, mod2, tab_w) \ @@ -404,7 +404,7 @@ do { \ } while(0) /** 16-point DFT - * * in [0..4n) / out [0..4n) / max < 8n + * * in [0..4n) / out [0..4n) / max < 4n * * Same specification as dft_lazy44, for depth==4 */ #define DFT16_LAZY44(p, node, mod, mod2, tab_w) \ @@ -478,7 +478,7 @@ do { \ /*--------------*/ /** 32-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 8n + * * in [0..n) / out [0..4n) / max < 4n * * Apart from this range, same specification as dft_node0_lazy24, for depth==5 */ #define DFT32_NODE0_LAZY14(p, mod, mod2, tab_w) \ @@ -534,7 +534,7 @@ do { } while(0) /** 32-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 8n + * * in [0..2n) / out [0..4n) / max < 4n * * Same specification as dft_node0_lazy24, for depth==5 */ #define DFT32_NODE0_LAZY24(p, mod, mod2, tab_w) \ @@ -590,7 +590,7 @@ do { } while(0) /** 32-point DFT - * * in [0..4n) / out [0..4n) / max < 8n + * * in [0..4n) / out [0..4n) / max < 4n * * Same specification as dft_lazy44, for depth==5 */ #define DFT32_LAZY44(p, node, mod, mod2, tab_w) \ @@ -624,7 +624,7 @@ do { /*-------------*/ /** 2**depth-point DFT - * * in [0..4n) / out [0..4n) / max < 8n + * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform p of length len == 2**depth into * the concatenation of * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] @@ -679,7 +679,7 @@ void 
dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) } /** 2**depth-point DFT - * * in [0..2n) / out [0..4n) / max < 8n + * * in [0..2n) / out [0..4n) / max < 4n * * In-place transform p of length len == 2**depth into * the concatenation of * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] @@ -726,7 +726,7 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) } /** 2**depth-point DFT - * * in [0..n) / out [0..4n) / max < 8n + * * in [0..n) / out [0..4n) / max < 4n * * In-place transform p of length len == 2**depth into * the concatenation of * [sum(p[i] * w_k**i for i in range(len), sum(p[i] * (-w_k)**i for i in range(len)] diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 8dd506a2a2..022d938a85 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -49,21 +49,19 @@ TEST_FUNCTION_START(n_fft_dft, state) { int i; - for (i = 0; i < 100 * flint_test_multiplier(); i++) + for (i = 0; i < 200 * flint_test_multiplier(); i++) { // take some FFT prime p with max_depth >= 12 ulong max_depth, prime; - // occasionally test large prime - // 61 bits: prime = 2305840260434624513 - // == 1 + 2**39 * 29 * 61 * 2371, log_2 == 60.999998 - // 29 bits: prime = 536608769 - // == 1 + 2**18 * 23 * 89, log_2 == 28.999295 - if (i % 10 == 0) + // half of tests == large prime, close to limit + // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1 + // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1 + if (i > 100) #if FLINT_BITS == 64 - prime = UWORD(2305840260434624513); + prime = UWORD(4611686018427322369); #else // FLINT_BITS == 32 - prime = UWORD(536608769); + prime = UWORD(1073479681); #endif else { @@ -117,7 +115,7 @@ TEST_FUNCTION_START(n_fft_dft, state) "max_depth = %wu\n" "depth = %wu\n" "failed equality test\n", - p, F->tab_w2[2*(max_depth-2)], max_depth, depth); + prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); res = nmod_vec_range(p, len, 4*mod.n); @@ -128,7 +126,7 @@ 
TEST_FUNCTION_START(n_fft_dft, state) "max_depth = %wu\n" "depth = %wu\n" "failed range test\n", - p, F->tab_w2[2*(max_depth-2)], max_depth, depth); + prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); _nmod_vec_clear(p); nmod_poly_clear(pol); From 174bb2db6013f2ccdaef54ea54f6625e1626dcbd Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sat, 26 Oct 2024 00:50:53 +0200 Subject: [PATCH 35/71] fix assert and try struct for args... for circumventing strange behaviour for large arrays --- src/n_fft.h | 9 +++++++++ src/n_fft/ctx_init.c | 2 +- src/n_fft/dft.c | 16 +++++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 8bb640964d..68fd525eec 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -45,6 +45,15 @@ typedef struct } n_fft_ctx_struct; typedef n_fft_ctx_struct n_fft_ctx_t[1]; +typedef struct +{ + ulong mod; // modulus, odd prime + ulong mod2; // 2*mod (storing helps for speed) + ulong * tab_w; // tabulated powers of w, see below +} n_fft_params_struct; +typedef n_fft_params_struct n_fft_params_t[1]; + + /** Requirements (not checked upon init): * - mod is an odd prime < 2**61 * - max_depth must be >= 3 (so, 8 must divide mod - 1) diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 1136c29085..1a82a72692 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -60,7 +60,7 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) { - FLINT_ASSERT(p > 2 && flint_clz(p) >= 3); // 2 < p < 2**61 + FLINT_ASSERT(p > 2 && flint_clz(p) >= 2); // 2 < p < 2**62 FLINT_ASSERT(flint_ctz(p - UWORD(1)) >= 3); // p-1 divisible by 8 // find the constant and exponent such that p == c * 2**max_depth + 1 diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 2b34db2a66..beae876d88 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -636,7 +636,7 @@ do { * 3 <= depth * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) */ -void 
dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_ctx_t F) +void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_params_t F) { if (depth == 3) DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); @@ -719,9 +719,10 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) // 4 recursive calls with depth-2 dft_node0_lazy24(p0, depth-2, F); - dft_lazy44(p1, depth-2, 1, F); - dft_lazy44(p2, depth-2, 2, F); - dft_lazy44(p3, depth-2, 3, F); + n_fft_params_t Fpar = {{F->mod, F->mod2, F->tab_w}}; + dft_lazy44(p1, depth-2, 1, Fpar); + dft_lazy44(p2, depth-2, 2, Fpar); + dft_lazy44(p3, depth-2, 3, Fpar); } } @@ -779,8 +780,9 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) // 4 recursive calls with depth-2 dft_node0_lazy24(p0, depth-2, F); - dft_lazy44(p1, depth-2, 1, F); - dft_lazy44(p2, depth-2, 2, F); - dft_lazy44(p3, depth-2, 3, F); + n_fft_params_t Fpar = {{F->mod, F->mod2, F->tab_w}}; + dft_lazy44(p1, depth-2, 1, Fpar); + dft_lazy44(p2, depth-2, 2, Fpar); + dft_lazy44(p3, depth-2, 3, Fpar); } } From c7f6dffb7d5ff7a4cc99e45a8d93493f70af0cc7 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 11:35:37 +0100 Subject: [PATCH 36/71] fft args and introducing iw --- src/n_fft.h | 59 +++++++++++------- src/n_fft/ctx_init.c | 2 - src/n_fft/dft.c | 133 +++++++++++++++++++++------------------- src/n_fft/test/t-init.c | 3 - 4 files changed, 106 insertions(+), 91 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 68fd525eec..caa63925e3 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -36,29 +36,21 @@ extern "C" { typedef struct { ulong mod; // modulus, odd prime - ulong mod2; // 2*mod (storing helps for speed) - //ulong mod4; // 4*mod (storing helps for speed) ulong max_depth; // maximum supported depth (w has order 2**max_depth) ulong depth; // depth supported by current precomputation - ulong * tab_w; // tabulated powers of w, see below + nn_ptr tab_w; // tabulated powers of w, see below + nn_ptr tab_iw; // 
tabulated powers of 1/w, see below ulong tab_w2[128]; // powers w**(2**k), see below + ulong tab_iw2[128]; // powers iw**(2**k), see below } n_fft_ctx_struct; typedef n_fft_ctx_struct n_fft_ctx_t[1]; -typedef struct -{ - ulong mod; // modulus, odd prime - ulong mod2; // 2*mod (storing helps for speed) - ulong * tab_w; // tabulated powers of w, see below -} n_fft_params_struct; -typedef n_fft_params_struct n_fft_params_t[1]; - - /** Requirements (not checked upon init): - * - mod is an odd prime < 2**61 + * - mod is an odd prime < 2**(FLINT_BITS-2) * - max_depth must be >= 3 (so, 8 must divide mod - 1) - * Total memory cost of precomputations: <= 128 + 2**(depth+1) ulong's - **/ + * Total memory cost of precomputations for arrays tab_{w,iw}: + * at most 2 * (128 + 2**depth) ulong's + */ /** tab_w2: * - length 128, with undefined entries at index 2*max_depth and beyond @@ -68,9 +60,12 @@ typedef n_fft_params_struct n_fft_params_t[1]; * and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod) * -- for 2*max_depth <= k < 128, tab_w2[k] is undefined * - * The first elements are tab_w2 = [I, I_pr, J, J_pr, ...] - * where I is a square root of -1 and J is a square root of I. - **/ + * --> one can retrieve w as tab_w2[2 * (max_depth-2)] + * --> the first elements are tab_w2 = [I, I_pr, J, J_pr, ...] + * where I is a square root of -1 and J is a square root of I + */ + +/** tab_iw2: same as tab_w2 but for the primitive root 1/w */ /** tab_w: * - length 2**depth @@ -89,13 +84,14 @@ typedef n_fft_params_struct n_fft_params_t[1]; * not stored. 
**/ +/** tab_iw: same as tab_w but for the primitive root 1/w */ -/* note for init functions, when depth is provided: +/** Note for init functions, when depth is provided: * - if it is < 3, it is pretended that it is 3 * - it it is more than F->max_depth (the maximum possible with the given * prime), it is reduced to F->max_depth * After calling init, precomputations support DFTs of length up to 2**depth - **/ + */ // initialize with given root and given depth void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, ulong mod); @@ -104,10 +100,12 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p); // same, with default depth -FLINT_INLINE void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong p) +FLINT_FORCE_INLINE +void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong p) { n_fft_ctx_init2_root(F, w, max_depth, N_FFT_CTX_DEFAULT_DEPTH, p); } -FLINT_INLINE void n_fft_ctx_init(n_fft_ctx_t F, ulong p) +FLINT_FORCE_INLINE +void n_fft_ctx_init(n_fft_ctx_t F, ulong p) { n_fft_ctx_init2(F, N_FFT_CTX_DEFAULT_DEPTH, p); } // grows F->depth and precomputations to support DFTs of depth up to depth @@ -116,6 +114,23 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth); void n_fft_ctx_clear(n_fft_ctx_t F); +typedef struct +{ + ulong mod; // modulus, odd prime + ulong mod2; // 2*mod (storing helps for speed) + //ulong mod4; // 4*mod (storing helps for speed) + nn_srcptr tab_w; // tabulated powers of w, see below +} n_fft_args_struct; +typedef n_fft_args_struct n_fft_args_t[1]; + +FLINT_FORCE_INLINE +void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w) +{ + F->mod = mod; + F->mod2 = 2*mod; + F->tab_w = tab_w; +} + diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 1a82a72692..d984165bce 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -21,8 +21,6 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, 
ulong w, ulong max_depth, ulong depth, // fill basic attributes F->mod = p; - F->mod2 = 2*p; - //F->mod4 = 4*p; F->max_depth = max_depth; F->depth = 3; // to be able to call fit_depth below diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index beae876d88..bc5a34a002 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -305,51 +305,53 @@ do { \ * * in [0..n) / out [0..4n) / max < 4n * * Apart from this range, same specification as dft_node0_lazy24, for depth==4 */ -#define DFT16_NODE0_LAZY14(p, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo, tmp; \ - \ - DFT4_NODE0_LAZY14(p[0], p[4], p[ 8], p[12], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[0] >= mod2) \ - p[0] -= mod2; \ - DFT4_NODE0_LAZY14(p[1], p[5], p[ 9], p[13], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[1] >= mod2) \ - p[1] -= mod2; \ - DFT4_NODE0_LAZY14(p[2], p[6], p[10], p[14], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[2] >= mod2) \ - p[2] -= mod2; \ - DFT4_NODE0_LAZY14(p[3], p[7], p[11], p[15], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[3] >= mod2) \ - p[3] -= mod2; \ - \ - /* next line requires < 2n, */ \ - /* hence the four reductions above */ \ - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44(p[4], p[5], p[6], p[7], \ - tab_w[2], tab_w[3], \ - tab_w[4], tab_w[5], \ - tab_w[6], tab_w[7], \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[8], p[9], p[10], p[11], \ - tab_w[4], tab_w[5], \ - tab_w[8], tab_w[9], \ - tab_w[10], tab_w[11], \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[12], p[13], p[14], p[15], \ - tab_w[6], tab_w[7], \ - tab_w[12], tab_w[13], \ - tab_w[14], tab_w[15], \ - mod, mod2, p_hi, p_lo, tmp); \ +#define DFT16_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT4_NODE0_LAZY14((p0), (p4), (p8), (p12), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, 
p_lo); \ + if ((p0) >= mod2) \ + (p0) -= mod2; \ + DFT4_NODE0_LAZY14((p1), (p5), (p9), (p13), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if ((p1) >= mod2) \ + (p1) -= mod2; \ + DFT4_NODE0_LAZY14((p2), (p6), (p10), (p14), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if ((p2) >= mod2) \ + (p2) -= mod2; \ + DFT4_NODE0_LAZY14((p3), (p7), (p11), (p15), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if ((p3) >= mod2) \ + (p3) -= mod2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_NODE0_LAZY24((p0), (p1), (p2), (p3), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + DFT4_LAZY44((p4), (p5), (p6), (p7), \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44((p8), (p9), (p10), (p11), \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44((p12), (p13), (p14), (p15), \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + mod, mod2, p_hi, p_lo, tmp); \ } while(0) /** 16-point DFT, node 0 @@ -636,7 +638,7 @@ do { * 3 <= depth * (node+1) * 2**depth <= 2**F.depth (length of F->tab_w) */ -void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_params_t F) +void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 3) DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); @@ -690,7 +692,7 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_params_t F) * root of unity with exponents listed in bit reversed order * * Requirements (not checked): 3 <= depth <= F.depth */ -void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) +void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 3) DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); @@ -719,10 +721,9 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_ctx_t F) // 4 recursive 
calls with depth-2 dft_node0_lazy24(p0, depth-2, F); - n_fft_params_t Fpar = {{F->mod, F->mod2, F->tab_w}}; - dft_lazy44(p1, depth-2, 1, Fpar); - dft_lazy44(p2, depth-2, 2, Fpar); - dft_lazy44(p3, depth-2, 3, Fpar); + dft_lazy44(p1, depth-2, 1, F); + dft_lazy44(p2, depth-2, 2, F); + dft_lazy44(p3, depth-2, 3, F); } } @@ -743,22 +744,27 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth == 0) return; + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_w); + if (depth == 1) { ulong tmp; - DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); + DFT2_NODE0_LAZY12(p[0], p[1], Fargs->mod, tmp); } else if (depth == 2) { ulong p_hi, p_lo; - DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], Fargs->tab_w[2], Fargs->tab_w[3], Fargs->mod, Fargs->mod2, p_hi, p_lo); } else if (depth == 3) - DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); + DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY14(p, F->mod, F->mod2, F->tab_w); + DFT16_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, + p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, + Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 5) - DFT32_NODE0_LAZY14(p, F->mod, F->mod2, F->tab_w); + DFT32_NODE0_LAZY14(p, Fargs->mod, Fargs->mod2, Fargs->tab_w); else { const ulong len = UWORD(1) << depth; @@ -773,16 +779,15 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) ulong p_hi, p_lo; for (ulong k = 0; k < len/4; k++) { - DFT4_NODE0_LAZY14(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - if (p0[k] >= F->mod2) - p0[k] -= F->mod2; + DFT4_NODE0_LAZY14(p0[k], p1[k], p2[k], p3[k], Fargs->tab_w[2], Fargs->tab_w[3], Fargs->mod, Fargs->mod2, p_hi, p_lo); + if (p0[k] >= Fargs->mod2) + p0[k] -= Fargs->mod2; } // 4 recursive calls with depth-2 - dft_node0_lazy24(p0, 
depth-2, F); - n_fft_params_t Fpar = {{F->mod, F->mod2, F->tab_w}}; - dft_lazy44(p1, depth-2, 1, Fpar); - dft_lazy44(p2, depth-2, 2, Fpar); - dft_lazy44(p3, depth-2, 3, Fpar); + dft_node0_lazy24(p0, depth-2, Fargs); + dft_lazy44(p1, depth-2, 1, Fargs); + dft_lazy44(p2, depth-2, 2, Fargs); + dft_lazy44(p3, depth-2, 3, Fargs); } } diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index 2012dc69c1..de7038a3bc 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -38,9 +38,6 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t if (F->mod != p) return 1; - if (F->mod2 != 2*p) - return 2; - //if (F->mod4 != 4*p) // return 3; From 5b842601dd0b125355239c810c12874bcd3ff1bb Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 11:37:23 +0100 Subject: [PATCH 37/71] dft 16 macro pointers --- src/n_fft/dft.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index bc5a34a002..57eadf9d4d 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -311,43 +311,43 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT4_NODE0_LAZY14((p0), (p4), (p8), (p12), \ + DFT4_NODE0_LAZY14(*(p0), *(p4), *(p8), *(p12), \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if ((p0) >= mod2) \ - (p0) -= mod2; \ - DFT4_NODE0_LAZY14((p1), (p5), (p9), (p13), \ + if (*(p0) >= mod2) \ + *(p0) -= mod2; \ + DFT4_NODE0_LAZY14(*(p1), *(p5), *(p9), *(p13), \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if ((p1) >= mod2) \ - (p1) -= mod2; \ - DFT4_NODE0_LAZY14((p2), (p6), (p10), (p14), \ + if (*(p1) >= mod2) \ + *(p1) -= mod2; \ + DFT4_NODE0_LAZY14(*(p2), *(p6), *(p10), *(p14), \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if ((p2) >= mod2) \ - (p2) -= mod2; \ - DFT4_NODE0_LAZY14((p3), (p7), (p11), (p15), \ + if (*(p2) >= mod2) \ + *(p2) -= mod2; \ + DFT4_NODE0_LAZY14(*(p3), *(p7), *(p11), *(p15), \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if 
((p3) >= mod2) \ - (p3) -= mod2; \ + if (*(p3) >= mod2) \ + *(p3) -= mod2; \ \ /* next line requires < 2n, */ \ /* hence the four reductions above */ \ - DFT4_NODE0_LAZY24((p0), (p1), (p2), (p3), \ + DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44((p4), (p5), (p6), (p7), \ + DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44((p8), (p9), (p10), (p11), \ + DFT4_LAZY44(*(p8), *(p9), *(p10), *(p11), \ tab_w[4], tab_w[5], \ tab_w[8], tab_w[9], \ tab_w[10], tab_w[11], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44((p12), (p13), (p14), (p15), \ + DFT4_LAZY44(*(p12), *(p13), *(p14), *(p15), \ tab_w[6], tab_w[7], \ tab_w[12], tab_w[13], \ tab_w[14], tab_w[15], \ @@ -762,7 +762,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 4) DFT16_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, - Fargs->mod, Fargs->mod2, Fargs->tab_w); + Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 5) DFT32_NODE0_LAZY14(p, Fargs->mod, Fargs->mod2, Fargs->tab_w); else From 1fb2479cf041893ac7e640ad2e6b14947c4c1791 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 11:54:13 +0100 Subject: [PATCH 38/71] dft 16 macro pointers --- src/n_fft/dft.c | 98 ++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 57eadf9d4d..bbcd1d586d 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -358,51 +358,53 @@ do { \ * * in [0..2n) / out [0..4n) / max < 4n * * Same specification as dft_node0_lazy24, for depth==4 */ -#define DFT16_NODE0_LAZY24(p, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo, tmp; \ - \ - DFT4_NODE0_LAZY24(p[0], p[4], p[ 8], p[12], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[0] >= mod2) \ - p[0] -= mod2; \ - 
DFT4_NODE0_LAZY24(p[1], p[5], p[ 9], p[13], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[1] >= mod2) \ - p[1] -= mod2; \ - DFT4_NODE0_LAZY24(p[2], p[6], p[10], p[14], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[2] >= mod2) \ - p[2] -= mod2; \ - DFT4_NODE0_LAZY24(p[3], p[7], p[11], p[15], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[3] >= mod2) \ - p[3] -= mod2; \ - \ - /* next line requires < 2n, */ \ - /* hence the four reductions above */ \ - DFT4_NODE0_LAZY24(p[0], p[1], p[2], p[3], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44(p[4], p[5], p[6], p[7], \ - tab_w[2], tab_w[3], \ - tab_w[4], tab_w[5], \ - tab_w[6], tab_w[7], \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[8], p[9], p[10], p[11], \ - tab_w[4], tab_w[5], \ - tab_w[8], tab_w[9], \ - tab_w[10], tab_w[11], \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[12], p[13], p[14], p[15], \ - tab_w[6], tab_w[7], \ - tab_w[12], tab_w[13], \ - tab_w[14], tab_w[15], \ - mod, mod2, p_hi, p_lo, tmp); \ +#define DFT16_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT4_NODE0_LAZY24(*(p0), *(p4), *(p8), *(p12), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (*(p0) >= mod2) \ + *(p0) -= mod2; \ + DFT4_NODE0_LAZY24(*(p1), *(p5), *(p9), *(p13), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (*(p1) >= mod2) \ + *(p1) -= mod2; \ + DFT4_NODE0_LAZY24(*(p2), *(p6), *(p10), *(p14), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (*(p2) >= mod2) \ + *(p2) -= mod2; \ + DFT4_NODE0_LAZY24(*(p3), *(p7), *(p11), *(p15), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (*(p3) >= mod2) \ + *(p3) -= mod2; \ + \ + /* next line requires < 2n, */ \ + /* hence the four reductions above */ \ + DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + DFT4_LAZY44(*(p4), 
*(p5), *(p6), *(p7), \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(*(p8), *(p9), *(p10), *(p11), \ + tab_w[4], tab_w[5], \ + tab_w[8], tab_w[9], \ + tab_w[10], tab_w[11], \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(*(p12), *(p13), *(p14), *(p15), \ + tab_w[6], tab_w[7], \ + tab_w[12], tab_w[13], \ + tab_w[14], tab_w[15], \ + mod, mod2, p_hi, p_lo, tmp); \ } while(0) /** 16-point DFT @@ -478,6 +480,8 @@ do { \ /*--------------*/ /* 32-point DFT */ /*--------------*/ +//p16, p17, p18, p19, p20, p21, p22, p23, \ +//p24, p25, p26, p27, p28, p29, p30, p31, \ /** 32-point DFT, node 0 * * in [0..n) / out [0..4n) / max < 4n @@ -697,7 +701,9 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) if (depth == 3) DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); + DFT16_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, + p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, + F->mod, F->mod2, F->tab_w); else if (depth == 5) DFT32_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); else From 4d8071fb68eafa4ee88a531cd27c78ac953433ae Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 12:01:24 +0100 Subject: [PATCH 39/71] dft8 simplify a bit --- src/n_fft/dft.c | 66 ++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index bbcd1d586d..c13b58c052 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -222,17 +222,17 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT2_NODE0_LAZY12(*(p0), *(p4), mod, tmp); \ - DFT2_NODE0_LAZY12(*(p1), *(p5), mod, tmp); \ - DFT2_NODE0_LAZY12(*(p2), *(p6), mod, tmp); \ - DFT2_NODE0_LAZY12(*(p3), *(p7), mod, tmp); \ + DFT2_NODE0_LAZY12(p0, p4, mod, tmp); \ + DFT2_NODE0_LAZY12(p1, p5, mod, tmp); \ + DFT2_NODE0_LAZY12(p2, p6, mod, tmp); \ + DFT2_NODE0_LAZY12(p3, 
p7, mod, tmp); \ \ - DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ /* could use a lazy24 variant of the next macro, */ \ /* but the gain is negligible */ \ - DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + DFT4_LAZY44(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ @@ -248,15 +248,15 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT2_NODE0_LAZY24(*(p0), *(p4), mod2, tmp); \ - DFT2_NODE0_LAZY24(*(p1), *(p5), mod2, tmp); \ - DFT2_NODE0_LAZY24(*(p2), *(p6), mod2, tmp); \ - DFT2_NODE0_LAZY24(*(p3), *(p7), mod2, tmp); \ + DFT2_NODE0_LAZY24(p0, p4, mod2, tmp); \ + DFT2_NODE0_LAZY24(p1, p5, mod2, tmp); \ + DFT2_NODE0_LAZY24(p2, p6, mod2, tmp); \ + DFT2_NODE0_LAZY24(p3, p7, mod2, tmp); \ \ - DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + DFT4_LAZY44(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ @@ -279,18 +279,18 @@ do { \ \ const ulong w = tab_w[2*(node)]; \ const ulong w_pr = tab_w[2*(node)+1]; \ - DFT2_LAZY44(*(p0), *(p4), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(*(p1), *(p5), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(*(p2), *(p6), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(*(p3), *(p7), mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p0, p4, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p1, p5, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p2, p6, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p3, p7, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ \ - DFT4_LAZY44(*(p0), *(p1), *(p2), *(p3), \ + DFT4_LAZY44(p0, p1, p2, p3, \ tab_w[4*(node)], tab_w[4*(node)+1], \ tab_w[8*(node)], tab_w[8*(node)+1], \ tab_w[8*(node)+2], tab_w[8*(node)+3], \ mod, mod2, p_hi, p_lo, u); \ \ - DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + 
DFT4_LAZY44(p4, p5, p6, p7, \ tab_w[4*(node)+2], tab_w[4*(node)+3], \ tab_w[8*(node)+4], tab_w[8*(node)+5], \ tab_w[8*(node)+6], tab_w[8*(node)+7], \ @@ -533,10 +533,10 @@ do { p[7] -= mod2; \ \ /* next line requires < 2n, hence the 8 reductions above */ \ - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); \ - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); \ - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); \ - DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); \ + DFT8_NODE0_LAZY24(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], mod, mod2, tab_w); \ + DFT8_LAZY44( p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 1, mod, mod2, tab_w); \ + DFT8_LAZY44( p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 2, mod, mod2, tab_w); \ + DFT8_LAZY44( p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 3, mod, mod2, tab_w); \ } while(0) /** 32-point DFT, node 0 @@ -589,10 +589,10 @@ do { p[7] -= mod2; \ \ /* next line requires < 2n, hence the 8 reductions above */ \ - DFT8_NODE0_LAZY24(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, mod, mod2, tab_w); \ - DFT8_LAZY44( p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 1, mod, mod2, tab_w); \ - DFT8_LAZY44( p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 2, mod, mod2, tab_w); \ - DFT8_LAZY44( p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 3, mod, mod2, tab_w); \ + DFT8_NODE0_LAZY24(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], mod, mod2, tab_w); \ + DFT8_LAZY44( p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 1, mod, mod2, tab_w); \ + DFT8_LAZY44( p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 2, mod, mod2, tab_w); \ + DFT8_LAZY44( p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 3, mod, mod2, tab_w); \ } while(0) /** 32-point DFT @@ -619,10 +619,10 @@ do { DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, 
p_hi, p_lo, tmp); \ \ /* next line requires < 2n, hence the four reductions above */ \ - DFT8_LAZY44(p+ 0, p+ 1, p+ 2, p+ 3, p+ 4, p+ 5, p+ 6, p+ 7, 4*node, mod, mod2, tab_w); \ - DFT8_LAZY44(p+ 8, p+ 9, p+10, p+11, p+12, p+13, p+14, p+15, 4*node+1, mod, mod2, tab_w); \ - DFT8_LAZY44(p+16, p+17, p+18, p+19, p+20, p+21, p+22, p+23, 4*node+2, mod, mod2, tab_w); \ - DFT8_LAZY44(p+24, p+25, p+26, p+27, p+28, p+29, p+30, p+31, 4*node+3, mod, mod2, tab_w); \ + DFT8_LAZY44(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], 4*node, mod, mod2, tab_w); \ + DFT8_LAZY44(p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 4*node+1, mod, mod2, tab_w); \ + DFT8_LAZY44(p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 4*node+2, mod, mod2, tab_w); \ + DFT8_LAZY44(p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 4*node+3, mod, mod2, tab_w); \ } while(0) /*-------------*/ @@ -645,7 +645,7 @@ do { void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 3) - DFT8_LAZY44(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, node, F->mod, F->mod2, F->tab_w); + DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); else if (depth == 4) DFT16_LAZY44(p, node, F->mod, F->mod2, F->tab_w); else if (depth == 5) @@ -699,7 +699,7 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 3) - DFT8_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, F->mod, F->mod2, F->tab_w); + DFT8_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); else if (depth == 4) DFT16_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, @@ -764,7 +764,7 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], Fargs->tab_w[2], Fargs->tab_w[3], Fargs->mod, Fargs->mod2, p_hi, p_lo); } else if (depth == 3) - DFT8_NODE0_LAZY14(p+0, p+1, p+2, p+3, 
p+4, p+5, p+6, p+7, Fargs->mod, Fargs->mod2, Fargs->tab_w); + DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 4) DFT16_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, From 6e751ea1400e5da8253f7b5f10791b4461f83a9f Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 12:01:47 +0100 Subject: [PATCH 40/71] clean --- src/n_fft/dft.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index c13b58c052..c4650e98e9 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -272,29 +272,29 @@ do { \ * * By construction these 8 evaluation points are the 8 roots of the * polynomial x**8 - F->tab_w[node] */ -#define DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ - node, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo, u, v; \ - \ - const ulong w = tab_w[2*(node)]; \ - const ulong w_pr = tab_w[2*(node)+1]; \ - DFT2_LAZY44(p0, p4, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(p1, p5, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(p2, p6, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - DFT2_LAZY44(p3, p7, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ - \ - DFT4_LAZY44(p0, p1, p2, p3, \ - tab_w[4*(node)], tab_w[4*(node)+1], \ - tab_w[8*(node)], tab_w[8*(node)+1], \ - tab_w[8*(node)+2], tab_w[8*(node)+3], \ - mod, mod2, p_hi, p_lo, u); \ - \ - DFT4_LAZY44(p4, p5, p6, p7, \ - tab_w[4*(node)+2], tab_w[4*(node)+3], \ - tab_w[8*(node)+4], tab_w[8*(node)+5], \ - tab_w[8*(node)+6], tab_w[8*(node)+7], \ - mod, mod2, p_hi, p_lo, u); \ +#define DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ + node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, u, v; \ + \ + const ulong w = tab_w[2*(node)]; \ + const ulong w_pr = tab_w[2*(node)+1]; \ + DFT2_LAZY44(p0, p4, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p1, p5, mod, mod2, w, w_pr, p_hi, p_lo, u, v); 
\ + DFT2_LAZY44(p2, p6, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + DFT2_LAZY44(p3, p7, mod, mod2, w, w_pr, p_hi, p_lo, u, v); \ + \ + DFT4_LAZY44(p0, p1, p2, p3, \ + tab_w[4*(node)], tab_w[4*(node)+1], \ + tab_w[8*(node)], tab_w[8*(node)+1], \ + tab_w[8*(node)+2], tab_w[8*(node)+3], \ + mod, mod2, p_hi, p_lo, u); \ + \ + DFT4_LAZY44(p4, p5, p6, p7, \ + tab_w[4*(node)+2], tab_w[4*(node)+3], \ + tab_w[8*(node)+4], tab_w[8*(node)+5], \ + tab_w[8*(node)+6], tab_w[8*(node)+7], \ + mod, mod2, p_hi, p_lo, u); \ } while(0) /*--------------*/ From 192a575d01d3aa2ac29343c28fff7cb5e85d5e2c Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 12:04:40 +0100 Subject: [PATCH 41/71] dft16 simplify a bit --- src/n_fft/dft.c | 72 ++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index c4650e98e9..12382ece4e 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -311,43 +311,43 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT4_NODE0_LAZY14(*(p0), *(p4), *(p8), *(p12), \ + DFT4_NODE0_LAZY14(p0, p4, p8, p12, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p0) >= mod2) \ - *(p0) -= mod2; \ - DFT4_NODE0_LAZY14(*(p1), *(p5), *(p9), *(p13), \ + if (p0 >= mod2) \ + p0 -= mod2; \ + DFT4_NODE0_LAZY14(p1, p5, p9, p13, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p1) >= mod2) \ - *(p1) -= mod2; \ - DFT4_NODE0_LAZY14(*(p2), *(p6), *(p10), *(p14), \ + if (p1 >= mod2) \ + p1 -= mod2; \ + DFT4_NODE0_LAZY14(p2, p6, p10, p14, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p2) >= mod2) \ - *(p2) -= mod2; \ - DFT4_NODE0_LAZY14(*(p3), *(p7), *(p11), *(p15), \ + if (p2 >= mod2) \ + p2 -= mod2; \ + DFT4_NODE0_LAZY14(p3, p7, p11, p15, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p3) >= mod2) \ - *(p3) -= mod2; \ + if (p3 >= mod2) \ + p3 -= mod2; \ \ /* next line requires < 2n, */ \ /* hence the four reductions above */ \ - DFT4_NODE0_LAZY24(*(p0), 
*(p1), *(p2), *(p3), \ + DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + DFT4_LAZY44(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(*(p8), *(p9), *(p10), *(p11), \ + DFT4_LAZY44(p8, p9, p10, p11, \ tab_w[4], tab_w[5], \ tab_w[8], tab_w[9], \ tab_w[10], tab_w[11], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(*(p12), *(p13), *(p14), *(p15), \ + DFT4_LAZY44(p12, p13, p14, p15, \ tab_w[6], tab_w[7], \ tab_w[12], tab_w[13], \ tab_w[14], tab_w[15], \ @@ -364,43 +364,43 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT4_NODE0_LAZY24(*(p0), *(p4), *(p8), *(p12), \ + DFT4_NODE0_LAZY24(p0, p4, p8, p12, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p0) >= mod2) \ - *(p0) -= mod2; \ - DFT4_NODE0_LAZY24(*(p1), *(p5), *(p9), *(p13), \ + if (p0 >= mod2) \ + p0 -= mod2; \ + DFT4_NODE0_LAZY24(p1, p5, p9, p13, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p1) >= mod2) \ - *(p1) -= mod2; \ - DFT4_NODE0_LAZY24(*(p2), *(p6), *(p10), *(p14), \ + if (p1 >= mod2) \ + p1 -= mod2; \ + DFT4_NODE0_LAZY24(p2, p6, p10, p14, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p2) >= mod2) \ - *(p2) -= mod2; \ - DFT4_NODE0_LAZY24(*(p3), *(p7), *(p11), *(p15), \ + if (p2 >= mod2) \ + p2 -= mod2; \ + DFT4_NODE0_LAZY24(p3, p7, p11, p15, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - if (*(p3) >= mod2) \ - *(p3) -= mod2; \ + if (p3 >= mod2) \ + p3 -= mod2; \ \ /* next line requires < 2n, */ \ /* hence the four reductions above */ \ - DFT4_NODE0_LAZY24(*(p0), *(p1), *(p2), *(p3), \ + DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - DFT4_LAZY44(*(p4), *(p5), *(p6), *(p7), \ + DFT4_LAZY44(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(*(p8), *(p9), *(p10), *(p11), \ + DFT4_LAZY44(p8, p9, 
p10, p11, \ tab_w[4], tab_w[5], \ tab_w[8], tab_w[9], \ tab_w[10], tab_w[11], \ mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(*(p12), *(p13), *(p14), *(p15), \ + DFT4_LAZY44(p12, p13, p14, p15, \ tab_w[6], tab_w[7], \ tab_w[12], tab_w[13], \ tab_w[14], tab_w[15], \ @@ -701,8 +701,8 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) if (depth == 3) DFT8_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY24(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, - p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, + DFT16_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], F->mod, F->mod2, F->tab_w); else if (depth == 5) DFT32_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); @@ -766,8 +766,8 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 3) DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY14(p+0, p+1, p+2, p+3, p+4, p+5, p+6, p+7, - p+8, p+9, p+10, p+11, p+12, p+13, p+14, p+15, + DFT16_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 5) DFT32_NODE0_LAZY14(p, Fargs->mod, Fargs->mod2, Fargs->tab_w); From 375a84d64e0374a1f3a428abbd17f916020f8923 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 12:09:01 +0100 Subject: [PATCH 42/71] dft16 4 4 now ok as well --- src/n_fft/dft.c | 134 +++++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 65 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 12382ece4e..f8c173a2cc 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -411,70 +411,72 @@ do { \ * * in [0..4n) / out [0..4n) / max < 4n * * Same specification as dft_lazy44, for depth==4 */ -#define DFT16_LAZY44(p, node, mod, mod2, tab_w) \ -do { \ 
- ulong p_hi, p_lo, tmp; \ - ulong w2, w2pre, w, wpre, Iw, Iwpre; \ - \ - w2 = tab_w[2*node]; \ - w2pre = tab_w[2*node+1]; \ - w = tab_w[4*node]; \ - wpre = tab_w[4*node+1]; \ - Iw = tab_w[4*node+2]; \ - Iwpre = tab_w[4*node+3]; \ - \ - DFT4_LAZY44(p[0], p[4], p[ 8], p[12], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[1], p[5], p[ 9], p[13], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[2], p[6], p[10], p[14], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[3], p[7], p[11], p[15], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - \ - w2 = tab_w[8*node]; \ - w2pre = tab_w[8*node+1]; \ - w = tab_w[16*node]; \ - wpre = tab_w[16*node+1]; \ - Iw = tab_w[16*node+2]; \ - Iwpre = tab_w[16*node+3]; \ - DFT4_LAZY44(p[0], p[1], p[2], p[3], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - \ - w2 = tab_w[8*node+2]; \ - w2pre = tab_w[8*node+3]; \ - w = tab_w[16*node+4]; \ - wpre = tab_w[16*node+5]; \ - Iw = tab_w[16*node+6]; \ - Iwpre = tab_w[16*node+7]; \ - DFT4_LAZY44(p[4], p[5], p[6], p[7], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - \ - w2 = tab_w[8*node+4]; \ - w2pre = tab_w[8*node+5]; \ - w = tab_w[16*node+8]; \ - wpre = tab_w[16*node+9]; \ - Iw = tab_w[16*node+10]; \ - Iwpre = tab_w[16*node+11]; \ - DFT4_LAZY44(p[8], p[9], p[10], p[11], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ - \ - w2 = tab_w[8*node+6]; \ - w2pre = tab_w[8*node+7]; \ - w = tab_w[16*node+12]; \ - wpre = tab_w[16*node+13]; \ - Iw = tab_w[16*node+14]; \ - Iwpre = tab_w[16*node+15]; \ - DFT4_LAZY44(p[12], p[13], p[14], p[15], \ - w2, w2pre, w, wpre, Iw, Iwpre, \ - mod, mod2, p_hi, p_lo, tmp); \ +#define DFT16_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + ulong w2, w2pre, w, wpre, Iw, Iwpre; 
\ + \ + w2 = tab_w[2*node]; \ + w2pre = tab_w[2*node+1]; \ + w = tab_w[4*node]; \ + wpre = tab_w[4*node+1]; \ + Iw = tab_w[4*node+2]; \ + Iwpre = tab_w[4*node+3]; \ + \ + DFT4_LAZY44(p0, p4, p8, p12, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p1, p5, p9, p13, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p2, p6, p10, p14, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p3, p7, p11, p15, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node]; \ + w2pre = tab_w[8*node+1]; \ + w = tab_w[16*node]; \ + wpre = tab_w[16*node+1]; \ + Iw = tab_w[16*node+2]; \ + Iwpre = tab_w[16*node+3]; \ + DFT4_LAZY44(p0, p1, p2, p3, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+2]; \ + w2pre = tab_w[8*node+3]; \ + w = tab_w[16*node+4]; \ + wpre = tab_w[16*node+5]; \ + Iw = tab_w[16*node+6]; \ + Iwpre = tab_w[16*node+7]; \ + DFT4_LAZY44(p4, p5, p6, p7, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+4]; \ + w2pre = tab_w[8*node+5]; \ + w = tab_w[16*node+8]; \ + wpre = tab_w[16*node+9]; \ + Iw = tab_w[16*node+10]; \ + Iwpre = tab_w[16*node+11]; \ + DFT4_LAZY44(p8, p9, p10, p11, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + w2 = tab_w[8*node+6]; \ + w2pre = tab_w[8*node+7]; \ + w = tab_w[16*node+12]; \ + wpre = tab_w[16*node+13]; \ + Iw = tab_w[16*node+14]; \ + Iwpre = tab_w[16*node+15]; \ + DFT4_LAZY44(p12, p13, p14, p15, \ + w2, w2pre, w, wpre, Iw, Iwpre, \ + mod, mod2, p_hi, p_lo, tmp); \ } while(0) /*--------------*/ @@ -647,7 +649,9 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) if (depth == 3) DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); else if (depth == 4) - DFT16_LAZY44(p, node, F->mod, F->mod2, F->tab_w); + DFT16_LAZY44(p[0], p[1], p[2], 
p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + node, F->mod, F->mod2, F->tab_w); else if (depth == 5) DFT32_LAZY44(p, node, F->mod, F->mod2, F->tab_w); else From c9b6575f1d15bc50f0bebdf44c878901cdb9717c Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 12:56:06 +0100 Subject: [PATCH 43/71] dft32 expanded as well; this is ugly but will help for versions with strides --- src/n_fft/dft.c | 292 ++++++++++++++++++++++++++---------------------- 1 file changed, 157 insertions(+), 135 deletions(-) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index f8c173a2cc..ecbc57de99 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -211,7 +211,7 @@ do { \ /** 8-point DFT, node 0 * * in [0..n) / out [0..4n) / max < 4n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial - * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations + * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations * p(1), p(-1), p(I), p(-I), p(J), p(-J), p(I*J), p(-I*J) * i.e. the evaluations at all 8-th roots of unity J**k for 0 <= k < 8 in * bit-reversed order @@ -266,7 +266,7 @@ do { \ /** 8-point DFT * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform p = [p0,p1,p2,p3,p4,p5,p6,p7], seen as a polynomial - * p(x) = p0 + p1*x + ... + p7*x**7 into its evaluations + * p(x) = p0 + p1*x + ... 
+ p7*x**7 into its evaluations * p(w0), p(-w0), p(w1), p(-w1), p(w2), p(-w2), p(w3), p(-w3) * where w_k = F->tab_w[8*node + 2*k] for 0 <= k < 4 * * By construction these 8 evaluation points are the 8 roots of the @@ -482,149 +482,159 @@ do { \ /*--------------*/ /* 32-point DFT */ /*--------------*/ -//p16, p17, p18, p19, p20, p21, p22, p23, \ -//p24, p25, p26, p27, p28, p29, p30, p31, \ /** 32-point DFT, node 0 * * in [0..n) / out [0..4n) / max < 4n * * Apart from this range, same specification as dft_node0_lazy24, for depth==5 */ -#define DFT32_NODE0_LAZY14(p, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo; \ - \ - DFT4_NODE0_LAZY14(p[0], p[8 ], p[16], p[24], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[0] >= mod2) \ - p[0] -= mod2; \ - DFT4_NODE0_LAZY14(p[1], p[9 ], p[17], p[25], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[1] >= mod2) \ - p[1] -= mod2; \ - DFT4_NODE0_LAZY14(p[2], p[10], p[18], p[26], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[2] >= mod2) \ - p[2] -= mod2; \ - DFT4_NODE0_LAZY14(p[3], p[11], p[19], p[27], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[3] >= mod2) \ - p[3] -= mod2; \ - DFT4_NODE0_LAZY14(p[4], p[12], p[20], p[28], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[4] >= mod2) \ - p[4] -= mod2; \ - DFT4_NODE0_LAZY14(p[5], p[13], p[21], p[29], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[5] >= mod2) \ - p[5] -= mod2; \ - DFT4_NODE0_LAZY14(p[6], p[14], p[22], p[30], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[6] >= mod2) \ - p[6] -= mod2; \ - DFT4_NODE0_LAZY14(p[7], p[15], p[23], p[31], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[7] >= mod2) \ - p[7] -= mod2; \ - \ - /* next line requires < 2n, hence the 8 reductions above */ \ - DFT8_NODE0_LAZY24(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], mod, mod2, tab_w); \ - DFT8_LAZY44( p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 1, mod, 
mod2, tab_w); \ - DFT8_LAZY44( p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 2, mod, mod2, tab_w); \ - DFT8_LAZY44( p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 3, mod, mod2, tab_w); \ +#define DFT32_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo; \ + \ + DFT4_NODE0_LAZY14(p0, p8, p16, p24, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p0 >= mod2) \ + p0 -= mod2; \ + DFT4_NODE0_LAZY14(p1, p9, p17, p25, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p1 >= mod2) \ + p1 -= mod2; \ + DFT4_NODE0_LAZY14(p2, p10, p18, p26, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p2 >= mod2) \ + p2 -= mod2; \ + DFT4_NODE0_LAZY14(p3, p11, p19, p27, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p3 >= mod2) \ + p3 -= mod2; \ + DFT4_NODE0_LAZY14(p4, p12, p20, p28, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p4 >= mod2) \ + p4 -= mod2; \ + DFT4_NODE0_LAZY14(p5, p13, p21, p29, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p5 >= mod2) \ + p5 -= mod2; \ + DFT4_NODE0_LAZY14(p6, p14, p22, p30, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p6 >= mod2) \ + p6 -= mod2; \ + DFT4_NODE0_LAZY14(p7, p15, p23, p31, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p7 >= mod2) \ + p7 -= mod2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, mod, mod2, tab_w); \ + DFT8_LAZY44(p8, p9, p10, p11, p12, p13, p14, p15, 1, mod, mod2, tab_w); \ + DFT8_LAZY44(p16, p17, p18, p19, p20, p21, p22, p23, 2, mod, mod2, tab_w); \ + DFT8_LAZY44(p24, p25, p26, p27, p28, p29, p30, p31, 3, mod, mod2, tab_w); \ } while(0) /** 32-point DFT, node 0 * * in [0..2n) / out [0..4n) / max < 4n * * Same specification as dft_node0_lazy24, for depth==5 */ 
-#define DFT32_NODE0_LAZY24(p, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo; \ - \ - DFT4_NODE0_LAZY24(p[0], p[8 ], p[16], p[24], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[0] >= mod2) \ - p[0] -= mod2; \ - DFT4_NODE0_LAZY24(p[1], p[9 ], p[17], p[25], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[1] >= mod2) \ - p[1] -= mod2; \ - DFT4_NODE0_LAZY24(p[2], p[10], p[18], p[26], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[2] >= mod2) \ - p[2] -= mod2; \ - DFT4_NODE0_LAZY24(p[3], p[11], p[19], p[27], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[3] >= mod2) \ - p[3] -= mod2; \ - DFT4_NODE0_LAZY24(p[4], p[12], p[20], p[28], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[4] >= mod2) \ - p[4] -= mod2; \ - DFT4_NODE0_LAZY24(p[5], p[13], p[21], p[29], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[5] >= mod2) \ - p[5] -= mod2; \ - DFT4_NODE0_LAZY24(p[6], p[14], p[22], p[30], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[6] >= mod2) \ - p[6] -= mod2; \ - DFT4_NODE0_LAZY24(p[7], p[15], p[23], p[31], \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - if (p[7] >= mod2) \ - p[7] -= mod2; \ - \ - /* next line requires < 2n, hence the 8 reductions above */ \ - DFT8_NODE0_LAZY24(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], mod, mod2, tab_w); \ - DFT8_LAZY44( p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 1, mod, mod2, tab_w); \ - DFT8_LAZY44( p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 2, mod, mod2, tab_w); \ - DFT8_LAZY44( p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 3, mod, mod2, tab_w); \ +#define DFT32_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo; \ + \ + DFT4_NODE0_LAZY24(p0, p8, p16, p24, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, 
p_lo); \ + if (p0 >= mod2) \ + p0 -= mod2; \ + DFT4_NODE0_LAZY24(p1, p9, p17, p25, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p1 >= mod2) \ + p1 -= mod2; \ + DFT4_NODE0_LAZY24(p2, p10, p18, p26, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p2 >= mod2) \ + p2 -= mod2; \ + DFT4_NODE0_LAZY24(p3, p11, p19, p27, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p3 >= mod2) \ + p3 -= mod2; \ + DFT4_NODE0_LAZY24(p4, p12, p20, p28, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p4 >= mod2) \ + p4 -= mod2; \ + DFT4_NODE0_LAZY24(p5, p13, p21, p29, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p5 >= mod2) \ + p5 -= mod2; \ + DFT4_NODE0_LAZY24(p6, p14, p22, p30, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p6 >= mod2) \ + p6 -= mod2; \ + DFT4_NODE0_LAZY24(p7, p15, p23, p31, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + if (p7 >= mod2) \ + p7 -= mod2; \ + \ + /* next line requires < 2n, hence the 8 reductions above */ \ + DFT8_NODE0_LAZY24(p0, p1, p2, p3, p4, p5, p6, p7, mod, mod2, tab_w); \ + DFT8_LAZY44(p8, p9, p10, p11, p12, p13, p14, p15, 1, mod, mod2, tab_w); \ + DFT8_LAZY44(p16, p17, p18, p19, p20, p21, p22, p23, 2, mod, mod2, tab_w); \ + DFT8_LAZY44(p24, p25, p26, p27, p28, p29, p30, p31, 3, mod, mod2, tab_w); \ } while(0) /** 32-point DFT * * in [0..4n) / out [0..4n) / max < 4n * * Same specification as dft_lazy44, for depth==5 */ -#define DFT32_LAZY44(p, node, mod, mod2, tab_w) \ -do { \ - ulong p_hi, p_lo, tmp; \ - \ - ulong w2 = tab_w[2*node]; \ - ulong w2pre = tab_w[2*node+1]; \ - ulong w = tab_w[4*node]; \ - ulong wpre = tab_w[4*node+1]; \ - ulong Iw = tab_w[4*node+2]; \ - ulong Iwpre = tab_w[4*node+3]; \ - DFT4_LAZY44(p[0], p[ 8], p[16], p[24], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[1], p[ 9], p[17], p[25], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[2], p[10], p[18], p[26], w2, w2pre, w, wpre, Iw, 
Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[3], p[11], p[19], p[27], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[4], p[12], p[20], p[28], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[5], p[13], p[21], p[29], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[6], p[14], p[22], p[30], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - DFT4_LAZY44(p[7], p[15], p[23], p[31], w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ - \ - /* next line requires < 2n, hence the four reductions above */ \ - DFT8_LAZY44(p[ 0], p[ 1], p[ 2], p[ 3], p[ 4], p[ 5], p[ 6], p[ 7], 4*node, mod, mod2, tab_w); \ - DFT8_LAZY44(p[ 8], p[ 9], p[10], p[11], p[12], p[13], p[14], p[15], 4*node+1, mod, mod2, tab_w); \ - DFT8_LAZY44(p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], 4*node+2, mod, mod2, tab_w); \ - DFT8_LAZY44(p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], 4*node+3, mod, mod2, tab_w); \ +#define DFT32_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ + p8, p9, p10, p11, p12, p13, p14, p15, \ + p16, p17, p18, p19, p20, p21, p22, p23, \ + p24, p25, p26, p27, p28, p29, p30, p31, \ + node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + ulong w2 = tab_w[2*node]; \ + ulong w2pre = tab_w[2*node+1]; \ + ulong w = tab_w[4*node]; \ + ulong wpre = tab_w[4*node+1]; \ + ulong Iw = tab_w[4*node+2]; \ + ulong Iwpre = tab_w[4*node+3]; \ + DFT4_LAZY44(p0, p8, p16, p24, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p1, p9, p17, p25, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p2, p10, p18, p26, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p3, p11, p19, p27, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p4, p12, p20, p28, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p5, p13, p21, p29, w2, w2pre, w, wpre, 
Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p6, p14, p22, p30, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + DFT4_LAZY44(p7, p15, p23, p31, w2, w2pre, w, wpre, Iw, Iwpre, mod, mod2, p_hi, p_lo, tmp); \ + \ + /* next line requires < 2n, hence the four reductions above */ \ + DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, 4*node, mod, mod2, tab_w); \ + DFT8_LAZY44(p8, p9, p10, p11, p12, p13, p14, p15, 4*node+1, mod, mod2, tab_w); \ + DFT8_LAZY44(p16, p17, p18, p19, p20, p21, p22, p23, 4*node+2, mod, mod2, tab_w); \ + DFT8_LAZY44(p24, p25, p26, p27, p28, p29, p30, p31, 4*node+3, mod, mod2, tab_w); \ } while(0) /*-------------*/ @@ -649,16 +659,20 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) if (depth == 3) DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); else if (depth == 4) - DFT16_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + DFT16_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], node, F->mod, F->mod2, F->tab_w); else if (depth == 5) - DFT32_LAZY44(p, node, F->mod, F->mod2, F->tab_w); + DFT32_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + node, F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; - // 4-point butterflies + // 4-point butterflies // in: [0..4n), out: [0..4n) const nn_ptr p0 = p; const nn_ptr p1 = p+len/4; @@ -705,11 +719,15 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) if (depth == 3) DFT8_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + DFT16_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], 
p[13], p[14], p[15], F->mod, F->mod2, F->tab_w); else if (depth == 5) - DFT32_NODE0_LAZY24(p, F->mod, F->mod2, F->tab_w); + DFT32_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -770,11 +788,15 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) else if (depth == 3) DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 4) - DFT16_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + DFT16_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], Fargs->mod, Fargs->mod2, Fargs->tab_w); else if (depth == 5) - DFT32_NODE0_LAZY14(p, Fargs->mod, Fargs->mod2, Fargs->tab_w); + DFT32_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], + p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], + Fargs->mod, Fargs->mod2, Fargs->tab_w); else { const ulong len = UWORD(1) << depth; From c24e7b6a854781b2272cd599be09994a71ce68c3 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 14:19:54 +0100 Subject: [PATCH 44/71] starting precomp of tab_inverse(w) --- src/n_fft/ctx_init.c | 42 +++++++++++++++++++++++++++++++++++--- src/n_fft/profile/p-init.c | 1 + 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index d984165bce..7e7785f656 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -9,9 +9,27 @@ (at your option) any later version. See . 
*/ +#include "flint.h" #include "ulong_extras.h" #include "n_fft.h" +/** Given the precomputed quotient a_pr for modular multiplication by a mod n, + * a_pr == floor(a * 2**FLINT_BITS / n) + * where we assume 0 < a < n and n does not divide a * 2**FLINT_BITS, + * this returns the quotient for mulmod by -a mod n, + * floor( (n-a) * 2**FLINT_BITS / n) + * == 2**FLINT_BITS - ceil(a * 2**FLINT_BITS / n) + * == 2**FLINT_BITS - 1 - a_pr == UWORD_MAX - a_pr + * + * Note: the requirement "n does not divide a * 2**FLINT_BITS" follows + * from the other requirement 0 < a < n as soon as n is odd; in n_fft.h + * we will only use this for odd primes + */ +FLINT_FORCE_INLINE ulong n_mulmod_precomp_shoup_negate(ulong a_pr) +{ + return UWORD_MAX - a_pr; +} + void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, ulong p) { if (depth < 3) @@ -40,25 +58,42 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, } // at this stage, pr_quo and pr_rem are for k == 0 i.e. for I == tab_w2[0] - // fill tab_w for depth 3 + // fill tab_w and tab_iw for depth 3 ulong len = UWORD(1) << (depth-1); // len >= 4 F->tab_w = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); + F->tab_iw = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); + // w**0 == iw**0 == 1 F->tab_w[0] = UWORD(1); F->tab_w[1] = n_mulmod_precomp_shoup(UWORD(1), p); + F->tab_iw[0] = UWORD(1); + F->tab_iw[1] = F->tab_w[1]; + + // w**(L/4) == I and iw**(L/4) == -I, L == 2**max_depth F->tab_w[2] = F->tab_w2[0]; F->tab_w[3] = F->tab_w2[1]; + F->tab_iw[2] = p - F->tab_w2[0]; + F->tab_iw[3] = n_mulmod_precomp_shoup_negate(F->tab_w2[1]); + + + // w**(L/8) == J and w**(3L/8) == I*J F->tab_w[4] = F->tab_w2[2]; F->tab_w[5] = F->tab_w2[3]; n_mulmod_and_precomp_shoup(F->tab_w+6, F->tab_w+7, F->tab_w2[0], F->tab_w2[2], pr_quo, pr_rem, F->tab_w2[3], p); + // iw**(L/8) == -I*J and iw**(3L/8) == -J + F->tab_iw[4] = p - F->tab_w[6]; + F->tab_iw[5] = n_mulmod_precomp_shoup_negate(F->tab_w[7]); + F->tab_iw[6] = p - F->tab_w[4]; +
F->tab_iw[7] = n_mulmod_precomp_shoup_negate(F->tab_w[5]); + // complete tab_w up to specified depth n_fft_ctx_fit_depth(F, depth); } void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) { - FLINT_ASSERT(p > 2 && flint_clz(p) >= 2); // 2 < p < 2**62 + FLINT_ASSERT(p > 2 && flint_clz(p) >= 2); // 2 < p < 2**(FLINT_BITS-2) FLINT_ASSERT(flint_ctz(p - UWORD(1)) >= 3); // p-1 divisible by 8 // find the constant and exponent such that p == c * 2**max_depth + 1 @@ -76,6 +111,7 @@ void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) void n_fft_ctx_clear(n_fft_ctx_t F) { flint_free(F->tab_w); + flint_free(F->tab_iw); } void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) @@ -91,7 +127,7 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) // tab_w[2] is w**(L/8) * tab_w[0], where L = 2**max_depth, // tab_w[2*4,2*6] is w**(L/16) * tab_w[2*0,2*2], // tab_w[2*8,2*10,2*12,2*14] is w**(L/32) * tab_w[2*0,2*2,2*4,2*6], etc. - // recall tab_w2[2*d] == w**(L / 2**(d+2)) + // recall tab_w2[2*k] == w**(L / 2**(k+2)) ulong d = F->depth - 1; ulong llen = UWORD(1) << (F->depth-1); ulong ww, pr_quo, pr_rem; diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c index c79e349f7e..62fb82d30d 100644 --- a/src/n_fft/profile/p-init.c +++ b/src/n_fft/profile/p-init.c @@ -105,6 +105,7 @@ int main() printf(" t_unit == raw time divided by 2**depth * clock scale factor\n"); printf("\n"); + printf(" \t 20 bits \t 31 bits \t 42 bits \t 50 bits \t 60 bits \n"); printf("depth\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\tt_raw | t_unit\n"); // TODO fix for FLINT_BITS==32 From e0ace33eda3ca61661a122759c98a390fca53777 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 15:08:33 +0100 Subject: [PATCH 45/71] add tab_iw in fitdepth --- src/n_fft.h | 3 --- src/n_fft/ctx_init.c | 8 ++++++-- src/n_fft/test/t-init.c | 30 ++++++++++++++++++++---------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 
caa63925e3..734e10e347 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -41,7 +41,6 @@ typedef struct nn_ptr tab_w; // tabulated powers of w, see below nn_ptr tab_iw; // tabulated powers of 1/w, see below ulong tab_w2[128]; // powers w**(2**k), see below - ulong tab_iw2[128]; // powers iw**(2**k), see below } n_fft_ctx_struct; typedef n_fft_ctx_struct n_fft_ctx_t[1]; @@ -65,8 +64,6 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; * where I is a square root of -1 and J is a square root of I */ -/** tab_iw2: same as tab_w2 but for the primitive root 1/w */ - /** tab_w: * - length 2**depth * - contains 2**(depth-1) first powers of w in (max_depth-1)-bit reversed order, diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 7e7785f656..59f0bcedbb 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -138,9 +138,13 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod); // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation for (ulong k = 0; k < llen; k++) + { n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*k, F->tab_w + 2*llen + 2*k+1, - ww, F->tab_w[2*k], - pr_quo, pr_rem, F->tab_w[2*k+1], F->mod); + ww, F->tab_w[2*k], + pr_quo, pr_rem, F->tab_w[2*k+1], F->mod); + F->tab_iw[2*llen + 2*(llen-1-k)] = F->mod - F->tab_w[2*llen + 2*k]; + F->tab_iw[2*llen + 2*(llen-1-k) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*k+1]); + } } F->depth = depth; } diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index de7038a3bc..97a268400f 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -9,8 +9,10 @@ (at your option) any later version. See . 
*/ +#include "nmod.h" #include "test_helpers.h" #include "ulong_extras.h" +#include "nmod_vec.h" #include "n_fft.h" // return bit reversal index of k for given nbits: @@ -47,10 +49,13 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t if (F->depth != depth) return 5; + // retrieve primitive root and its inverse + const ulong w = F->tab_w2[2*(max_depth-2)]; + const ulong iw = n_invmod(w, p); + // check the primitive root - ulong w = F->tab_w2[2*(max_depth-2)]; if (n_powmod2(w, UWORD(1)<depth - 1)); - ulong wk = F->tab_w[2*k]; - ulong exp = br_index(k, F->max_depth - 1); + + ulong wk = F->tab_w[2*k]; if (wk != n_powmod2(w, exp, p)) return 9; - if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p)) return 10; + + ulong iwk = F->tab_iw[2*k]; + if (iwk != n_powmod2(iw, exp, p)) + return 11; + if (F->tab_iw[2*k+1] != n_mulmod_precomp_shoup(iwk, p)) + return 12; } return 0; @@ -89,11 +99,11 @@ TEST_FUNCTION_START(n_fft_ctx_init2, state) ulong p, max_depth; if (i % 20 != 0) { - // take random prime in [17, 2**(FLINT_BITS-3)) + // take random prime in [17, 2**(FLINT_BITS-2)) #if FLINT_BITS == 64 - ulong bits = 5 + n_randint(state, 57); + ulong bits = 5 + n_randint(state, 58); #else - ulong bits = 5 + n_randint(state, 24); + ulong bits = 5 + n_randint(state, 25); #endif p = n_randprime(state, bits, 1); max_depth = flint_ctz(p-1); @@ -126,7 +136,7 @@ TEST_FUNCTION_START(n_fft_ctx_init2, state) // init n_fft_ctx_t F; n_fft_ctx_init2(F, depth, p); - + int res = test_one(F, max_depth, depth, p, state); if (res) From b376f7e2e8fa708454dc6641a36f1204c7b36560 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 15:32:55 +0100 Subject: [PATCH 46/71] unroll a bit is faster --- src/n_fft/ctx_init.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 59f0bcedbb..2bace830ad 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -123,6 
+123,7 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) { ulong len = UWORD(1) << (depth-1); // len >= 8 (since depth >= 4) F->tab_w = flint_realloc(F->tab_w, 2*len * sizeof(ulong)); + F->tab_iw = flint_realloc(F->tab_iw, 2*len * sizeof(ulong)); // tab_w[2] is w**(L/8) * tab_w[0], where L = 2**max_depth, // tab_w[2*4,2*6] is w**(L/16) * tab_w[2*0,2*2], @@ -137,15 +138,32 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) pr_quo = F->tab_w2[2*d+1]; pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod); // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation - for (ulong k = 0; k < llen; k++) + for (ulong k = 0; k+3 < llen; k+=4) { - n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*k, F->tab_w + 2*llen + 2*k+1, - ww, F->tab_w[2*k], - pr_quo, pr_rem, F->tab_w[2*k+1], F->mod); - F->tab_iw[2*llen + 2*(llen-1-k)] = F->mod - F->tab_w[2*llen + 2*k]; - F->tab_iw[2*llen + 2*(llen-1-k) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*k+1]); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+0), F->tab_w + 2*llen + 2*(k+0)+1, + ww, F->tab_w[2*(k+0)], + pr_quo, pr_rem, F->tab_w[2*(k+0)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+1), F->tab_w + 2*llen + 2*(k+1)+1, + ww, F->tab_w[2*(k+1)], + pr_quo, pr_rem, F->tab_w[2*(k+1)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+2), F->tab_w + 2*llen + 2*(k+2)+1, + ww, F->tab_w[2*(k+2)], + pr_quo, pr_rem, F->tab_w[2*(k+2)+1], F->mod); + n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+3), F->tab_w + 2*llen + 2*(k+3)+1, + ww, F->tab_w[2*(k+3)], + pr_quo, pr_rem, F->tab_w[2*(k+3)+1], F->mod); + + F->tab_iw[2*llen + 2*(llen-1-(k+0))] = F->mod - F->tab_w[2*llen + 2*(k+0)]; + F->tab_iw[2*llen + 2*(llen-1-(k+0)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+0)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+1))] = F->mod - F->tab_w[2*llen + 2*(k+1)]; + F->tab_iw[2*llen + 2*(llen-1-(k+1)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 
2*(k+1)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+2))] = F->mod - F->tab_w[2*llen + 2*(k+2)]; + F->tab_iw[2*llen + 2*(llen-1-(k+2)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+2)+1]); + F->tab_iw[2*llen + 2*(llen-1-(k+3))] = F->mod - F->tab_w[2*llen + 2*(k+3)]; + F->tab_iw[2*llen + 2*(llen-1-(k+3)) + 1] = n_mulmod_precomp_shoup_negate(F->tab_w[2*llen + 2*(k+3)+1]); } } + F->depth = depth; } } From 0e0df84936744e335275a32a3d00af357b924e95 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 15:40:38 +0100 Subject: [PATCH 47/71] notes about init --- src/n_fft.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/n_fft.h b/src/n_fft.h index 734e10e347..cb13b61a11 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -23,8 +23,14 @@ extern "C" { /** * TODO[short term] augment precomputations with inverse roots * TODO[short term] add testing for general variants, not only node0 - * TODO[longer term] large depth can lead to heavy memory usage + * TODO[long term] large depth can lead to heavy memory usage * --> provide precomputation-free functions + * TODO[long term] on zen4 (likely on other cpus as well) ctx_init becomes + * slower at some point, losing a factor 4 or more, probably due to caching; + * what is annoying is that the depth where it becomes slower is significantly + * smaller (~13-14) when tab_iw has been incorporated compared to without + * tab_iw (it was depth ~20-21); see if this can be understood, and maybe play + * with vectorization for those simple functions * TODO[later] provide forward function which reduces output to [0..n) ? * unclear this is useful... 
to be decided later */ From 40539deb41418ce24e8aa72946bbda9910a960df Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 17:39:53 +0100 Subject: [PATCH 48/71] wip: use multipoint eval in test --- src/n_fft.h | 4 ++-- src/n_fft/test/t-dft.c | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index cb13b61a11..b177cc1ecd 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -144,8 +144,8 @@ void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w) */ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO -void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots) -void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (dft on inverted roots) +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) +void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (dft on inverted roots, scaled) diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 022d938a85..231033de67 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -9,6 +9,7 @@ (at your option) any later version. See . 
*/ +#include "flint.h" #include "test_helpers.h" #include "ulong_extras.h" #include "nmod.h" @@ -79,6 +80,14 @@ TEST_FUNCTION_START(n_fft_dft, state) n_fft_ctx_t F; n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime); + // retrieve roots, used later for multipoint evaluation + nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong)); + for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++) + { + roots[2*k] = F->tab_w[2*k]; + roots[2*k+1] = prime - F->tab_w[2*k]; // < prime since F->tab_w[2*k] != 0 + } + for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++) { const ulong len = (UWORD(1) << depth); @@ -93,12 +102,8 @@ TEST_FUNCTION_START(n_fft_dft, state) if (len == 1) evals_br[0] = nmod_poly_evaluate_nmod(pol, UWORD(1)); else - for (ulong k = 0; k < len/2; k++) - { - ulong point = F->tab_w[2*k]; - evals_br[2*k] = nmod_poly_evaluate_nmod(pol, point); - evals_br[2*k+1] = nmod_poly_evaluate_nmod(pol, nmod_neg(point, mod)); - } + for (ulong k = 0; k < len; k++) + evals_br[k] = nmod_poly_evaluate_nmod(pol, roots[k]); // evals by DFT ulong * p = _nmod_vec_init(len); @@ -133,6 +138,7 @@ TEST_FUNCTION_START(n_fft_dft, state) _nmod_vec_clear(evals_br); } + flint_free(roots); n_fft_ctx_clear(F); } From 8c8b08ba46a9439b2553422da987d10abb85f226 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 17:47:22 +0100 Subject: [PATCH 49/71] use multipoint eval in test --- src/n_fft/test/t-dft.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 231033de67..653bc1efc6 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -17,7 +17,7 @@ #include "nmod_vec.h" #include "n_fft.h" -#define MAX_EVAL_DEPTH 10 +#define MAX_EVAL_DEPTH 11 // must be <= 12 // vector equality up to reduction mod static inline int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) @@ -55,7 +55,7 @@ TEST_FUNCTION_START(n_fft_dft, state) // take some FFT prime p with 
max_depth >= 12 ulong max_depth, prime; - // half of tests == large prime, close to limit + // half of tests == fixed large prime, close to limit // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1 // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1 if (i > 100) @@ -66,7 +66,7 @@ TEST_FUNCTION_START(n_fft_dft, state) #endif else { - max_depth = 12 + n_randint(state, 10); + max_depth = 12 + n_randint(state, 6); prime = 1 + (UWORD(1) << max_depth); while (! n_is_prime(prime)) prime += (UWORD(1) << max_depth); @@ -102,8 +102,7 @@ TEST_FUNCTION_START(n_fft_dft, state) if (len == 1) evals_br[0] = nmod_poly_evaluate_nmod(pol, UWORD(1)); else - for (ulong k = 0; k < len; k++) - evals_br[k] = nmod_poly_evaluate_nmod(pol, roots[k]); + nmod_poly_evaluate_nmod_vec(evals_br, pol, roots, len); // evals by DFT ulong * p = _nmod_vec_init(len); From b1fb6744abf715c73768039a93e36b26d1183112 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 18:16:18 +0100 Subject: [PATCH 50/71] idft_t --- src/n_fft.h | 42 ++++++++++++++++++++++++++++++++++++++++-- src/n_fft/dft.c | 30 ++++++++++++++---------------- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index b177cc1ecd..0149b755dc 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -13,6 +13,8 @@ #define N_FFT_H #include "flint.h" +#include "nmod.h" +#include "nmod_vec.h" #define N_FFT_CTX_DEFAULT_DEPTH 12 @@ -142,10 +144,46 @@ void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w) * transforms / inverse transforms / transposed transforms * at length a power of 2 */ -void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F); +void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F); + +/** 2**depth-point DFT + * * in [0..n) / out [0..4n) / max < 4n + * * In-place transform p of length len == 2**depth into + * the concatenation of + * [sum(p[i] * w_k**i for i in range(len)), sum(p[i] * (-w_k)**i for i in range(len))] + * for k in range(len/2), + * where w_k = F->tab_w[2*k]
for 0 <= k < 2**(depth-1) + * * By construction these evaluation points are the roots of the polynomial + * x**len - 1, precisely they are all powers of the chosen len-th primitive + * root of unity with exponents listed in bit reversed order + * * Requirements (not checked): depth <= F.depth + */ +FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_w); + dft_node0_lazy14(p, depth, Fargs); +} + +FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_iw); + dft_node0_lazy14(p, depth, Fargs); + + if (depth > 0) + { + nmod_t mod; + nmod_init(&mod, F->mod); + const ulong len = UWORD(1) << depth; + const ulong invlen = nmod_inv(len, mod); // TODO store? + _nmod_vec_scalar_mul_nmod(p, p, len, invlen, mod); + } +} + + void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) -void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (dft on inverted roots, scaled) diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index ecbc57de99..8c1d93faab 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -767,36 +767,33 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) * root of unity with exponents listed in bit reversed order * * Requirements (not checked): depth <= F.depth */ -void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) +void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 0) return; - n_fft_args_t Fargs; - n_fft_set_args(Fargs, F->mod, F->tab_w); - if (depth == 1) { ulong tmp; - DFT2_NODE0_LAZY12(p[0], p[1], Fargs->mod, tmp); + DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); } else if (depth == 2) { ulong p_hi, p_lo; - DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], Fargs->tab_w[2], Fargs->tab_w[3], Fargs->mod, Fargs->mod2, p_hi, p_lo); + DFT4_NODE0_LAZY14(p[0], p[1], 
p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } else if (depth == 3) - DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], Fargs->mod, Fargs->mod2, Fargs->tab_w); + DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); else if (depth == 4) DFT16_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], - Fargs->mod, Fargs->mod2, Fargs->tab_w); + F->mod, F->mod2, F->tab_w); else if (depth == 5) DFT32_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], - Fargs->mod, Fargs->mod2, Fargs->tab_w); + F->mod, F->mod2, F->tab_w); else { const ulong len = UWORD(1) << depth; @@ -811,15 +808,16 @@ void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) ulong p_hi, p_lo; for (ulong k = 0; k < len/4; k++) { - DFT4_NODE0_LAZY14(p0[k], p1[k], p2[k], p3[k], Fargs->tab_w[2], Fargs->tab_w[3], Fargs->mod, Fargs->mod2, p_hi, p_lo); - if (p0[k] >= Fargs->mod2) - p0[k] -= Fargs->mod2; + DFT4_NODE0_LAZY14(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + if (p0[k] >= F->mod2) + p0[k] -= F->mod2; } // 4 recursive calls with depth-2 - dft_node0_lazy24(p0, depth-2, Fargs); - dft_lazy44(p1, depth-2, 1, Fargs); - dft_lazy44(p2, depth-2, 2, Fargs); - dft_lazy44(p3, depth-2, 3, Fargs); + dft_node0_lazy24(p0, depth-2, F); + dft_lazy44(p1, depth-2, 1, F); + dft_lazy44(p2, depth-2, 2, F); + dft_lazy44(p3, depth-2, 3, F); } } + From dcf2ae70815e8e84887486a28a8a39b7e15a2f64 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 20:26:38 +0100 Subject: [PATCH 51/71] idft_t, not tested yet --- src/n_fft.h | 73 ++++++++++++++++++++++++++++---------- src/n_fft/ctx_init.c | 17 ++++++--- src/n_fft/profile/p-dft.c | 28 +++++++-------- 
src/n_fft/profile/p-init.c | 5 +-- src/n_fft/test/t-init.c | 34 +++++++++++------- 5 files changed, 103 insertions(+), 54 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index 0149b755dc..71ee9067f4 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -15,6 +15,7 @@ #include "flint.h" #include "nmod.h" #include "nmod_vec.h" +#include "ulong_extras.h" #define N_FFT_CTX_DEFAULT_DEPTH 12 @@ -43,29 +44,32 @@ extern "C" { typedef struct { - ulong mod; // modulus, odd prime - ulong max_depth; // maximum supported depth (w has order 2**max_depth) - ulong depth; // depth supported by current precomputation - nn_ptr tab_w; // tabulated powers of w, see below - nn_ptr tab_iw; // tabulated powers of 1/w, see below - ulong tab_w2[128]; // powers w**(2**k), see below + ulong mod; // modulus, odd prime + ulong max_depth; // maximum supported depth (w has order 2**max_depth) + ulong cofactor; // prime is 1 + cofactor * 2**max_depth + ulong depth; // depth supported by current precomputation + nn_ptr tab_w; // tabulated powers of w, see below + nn_ptr tab_iw; // tabulated powers of 1/w, see below + ulong tab_w2[2*FLINT_BITS]; // tabulated powers w**(2**k), see below + ulong tab_inv2[2*FLINT_BITS]; // tabulated inverses of 2**k, see below } n_fft_ctx_struct; typedef n_fft_ctx_struct n_fft_ctx_t[1]; + /** Requirements (not checked upon init): * - mod is an odd prime < 2**(FLINT_BITS-2) * - max_depth must be >= 3 (so, 8 must divide mod - 1) - * Total memory cost of precomputations for arrays tab_{w,iw}: - * at most 2 * (128 + 2**depth) ulong's + * Total memory cost of precomputations for arrays tab_{w,iw,w2,inv2}: + * at most 2 * (2*FLINT_BITS + 2**depth) ulong's */ /** tab_w2: - * - length 128, with undefined entries at index 2*max_depth and beyond + * - length 2*FLINT_BITS, with undefined entries at index 2*(max_depth-1) and beyond * - contains powers w**d for d a power of 2, and corresponding * precomputations for modular multiplication: * -- for 0 <= k < max_depth-1, tab_w2[2*k] = 
w**(2**(max_depth-2-k)) * and tab_w2[2*k+1] = floor(tab_w2[2*k] * 2**FLINT_BITS / mod) - * -- for 2*max_depth <= k < 128, tab_w2[k] is undefined + * -- for 2*(max_depth-1) <= k < 2*FLINT_BITS, tab_w2[k] is undefined * * --> one can retrieve w as tab_w2[2 * (max_depth-2)] * --> the first elements are tab_w2 = [I, I_pr, J, J_pr, ...] @@ -91,6 +95,27 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; /** tab_iw: same as tab_w but for the primitive root 1/w */ +/** tab_inv2: + * - length 2*FLINT_BITS, with undefined entries at index 2*max_depth and beyond + * - contains the modular inverses of 2**k, and corresponding + * precomputations for modular multiplication: + * -- for 0 <= k < max_depth, tab_inv2[2*k] = the inverse of 2**(k+1) + * modulo mod, and tab_inv2[2*k+1] = floor(tab_inv2[2*k] * 2**FLINT_BITS / mod) + * -- for 2*max_depth <= k < 2*FLINT_BITS, tab_inv2[k] is undefined + * + * Recall F->mod == 1 + cofactor * 2**max_depth, so + * 1 == F->mod - cofactor * 2**(max_depth - k) * 2**k + * --> the inverse of 2**k in (0, F->mod) is + * F->mod - cofactor * 2**(max_depth - k), + * we do not really need to store it, but we want the precomputations as well + */ + + + + + + + /** Note for init functions, when depth is provided: * - if it is < 3, it is pretended that it is 3 * - it it is more than F->max_depth (the maximum possible with the given @@ -99,15 +124,15 @@ typedef n_fft_ctx_struct n_fft_ctx_t[1]; */ // initialize with given root and given depth -void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, ulong mod); +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong mod); // find primitive root, initialize with given depth void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p); // same, with default depth FLINT_FORCE_INLINE -void n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong p) -{ n_fft_ctx_init2_root(F, w, max_depth, N_FFT_CTX_DEFAULT_DEPTH, p); } +void 
n_fft_ctx_init_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong p) +{ n_fft_ctx_init2_root(F, w, max_depth, cofactor, N_FFT_CTX_DEFAULT_DEPTH, p); } FLINT_FORCE_INLINE void n_fft_ctx_init(n_fft_ctx_t F, ulong p) @@ -119,6 +144,7 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth); void n_fft_ctx_clear(n_fft_ctx_t F); + typedef struct { ulong mod; // modulus, odd prime @@ -165,6 +191,8 @@ FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) dft_node0_lazy14(p, depth, Fargs); } +// FIXME in progress +// DOC. Note: output < n. FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) { n_fft_args_t Fargs; @@ -173,15 +201,22 @@ FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth > 0) { - nmod_t mod; - nmod_init(&mod, F->mod); - const ulong len = UWORD(1) << depth; - const ulong invlen = nmod_inv(len, mod); // TODO store? - _nmod_vec_scalar_mul_nmod(p, p, len, invlen, mod); + const ulong inv2 = F->tab_inv2[2*depth-2]; + const ulong inv2_pr = F->tab_inv2[2*depth-1]; + //ulong p_hi, p_lo; + for (ulong k = 0; k < (UWORD(1) << depth); k++) + { + p[k] = n_mulmod_shoup(inv2, p[k], inv2_pr, F->mod); + //umul_ppmm(p_hi, p_lo, inv2_pr, p[k]); + //p[k] = inv2 * p[k] - p_hi * F->mod; + } + // NOTE: apparently no gain from lazy variant, so + // probably better to use non-lazy one } + // FIXME see if that can be made less expensive at least for depths not too + // small, by integrating into base cases of dft_node0 } - void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 2bace830ad..0e541c645b 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -30,7 +30,7 @@ FLINT_FORCE_INLINE ulong n_mulmod_precomp_shoup_negate(ulong a_pr) return UWORD_MAX - a_pr; } -void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong 
max_depth, ulong depth, ulong p) +void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong cofactor, ulong depth, ulong p) { if (depth < 3) depth = 3; @@ -40,6 +40,7 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, // fill basic attributes F->mod = p; F->max_depth = max_depth; + F->cofactor = cofactor; F->depth = 3; // to be able to call fit_depth below // fill tab_w2 @@ -58,6 +59,13 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, } // at this stage, pr_quo and pr_rem are for k == 0 i.e. for I == tab_w2[0] + // fill tab_inv2 + for (ulong k = 0; k < max_depth; k++) + { + F->tab_inv2[2*k] = p - (cofactor << (max_depth - k-1)); + F->tab_inv2[2*k+1] = n_mulmod_precomp_shoup(F->tab_inv2[2*k], p); + } + // fill tab_w and tab_iw for depth 3 ulong len = UWORD(1) << (depth-1); // len >= 4 F->tab_w = (nn_ptr) flint_malloc(2*len * sizeof(ulong)); @@ -75,7 +83,6 @@ void n_fft_ctx_init2_root(n_fft_ctx_t F, ulong w, ulong max_depth, ulong depth, F->tab_iw[2] = p - F->tab_w2[0]; F->tab_iw[3] = n_mulmod_precomp_shoup_negate(F->tab_w2[1]); - // w**(L/8) == J and w**(3L/8) == I*J F->tab_w[4] = F->tab_w2[2]; F->tab_w[5] = F->tab_w2[3]; @@ -98,14 +105,14 @@ void n_fft_ctx_init2(n_fft_ctx_t F, ulong depth, ulong p) // find the constant and exponent such that p == c * 2**max_depth + 1 const ulong max_depth = flint_ctz(p - UWORD(1)); - const ulong c = (p - UWORD(1)) >> max_depth; + const ulong cofactor = (p - UWORD(1)) >> max_depth; // find primitive root w of order 2**max_depth const ulong prim_root = n_primitive_root_prime(p); - const ulong w = n_powmod2(prim_root, c, p); + const ulong w = n_powmod2(prim_root, cofactor, p); // fill all attributes and tables - n_fft_ctx_init2_root(F, w, max_depth, depth, p); + n_fft_ctx_init2_root(F, w, max_depth, cofactor, depth, p); } void n_fft_ctx_clear(n_fft_ctx_t F) diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index 733efa957d..f4948dc276 
100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -3,13 +3,12 @@ #include "fft_small.h" #include "n_fft.h" -#define num_primes 5 +#define num_primes 7 typedef struct { ulong prime; ulong depth; - ulong maxdepth; ulong stride; } info_t; @@ -19,24 +18,20 @@ void sample_##fun##_variant(void * arg, ulong count) info_t * info = (info_t *) arg; \ const ulong p = info->prime; \ const ulong depth = info->depth; \ - const ulong maxdepth = info->maxdepth; \ const ulong stride = info->stride; \ \ const ulong len = stride * (UWORD(1) << depth); \ const ulong rep = FLINT_MAX(1, FLINT_MIN(1000, 1000000/len)); \ \ /* modulus, roots of unity */ \ - nmod_t mod; \ - nmod_init(&mod, p); \ - ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), (p - 1) >> maxdepth, mod); \ - ulong w = nmod_pow_ui(w0, UWORD(1)<<(maxdepth - depth), mod); \ n_fft_ctx_t F; \ - n_fft_ctx_init2_root(F, w, depth, depth, p); \ + n_fft_ctx_init2(F, depth, p); \ \ FLINT_TEST_INIT(state); \ \ ulong * coeffs = _nmod_vec_init(len); \ - _nmod_vec_randtest(coeffs, state, len, mod); \ + for (ulong k = 0; k < len; k++) \ + coeffs[k] = n_randint(state, p); \ \ for (ulong i = 0; i < count; i++) \ { \ @@ -51,6 +46,7 @@ void sample_##fun##_variant(void * arg, ulong count) } \ SAMPLE(dft, ) +SAMPLE(idft_t, ) //SAMPLE(n_fft_dft, _stride) void sample_sd_fft(void * arg, ulong count) @@ -97,17 +93,18 @@ int main() flint_printf("- timing DFT (length power of 2) for several bit lengths and depths\n"); flint_printf("depth\tsd_fft\trec4\n"); - // FIXME FLINT_BITS issue ulong primes[num_primes] = { 786433, // 20 bits, 1 + 2**18 * 3 + 1073479681, // 30 bits, 1 + 2**30 - 2**18 == 1 + 2**18 * (2**12 - 1) 2013265921, // 31 bits, 1 + 2**27 * 3 * 5 2748779069441, // 42 bits, 1 + 2**39 * 5 1108307720798209, // 50 bits, 1 + 2**44 * 3**2 * 7 1139410705724735489, // 60 bits, 1 + 2**52 * 11 * 23 + 4611686018427322369 // 62 bits: 1 + 2**62 - 2**16 == 1 + 2**16 * (2**46 - 1) }; - ulong max_depths[num_primes] = { 18, 25, 25, 
25, 25 }; + ulong max_depths[num_primes] = { 18, 18, 25, 25, 25, 25, 16 }; - for (ulong k = 3; k < 4; k++) + for (ulong k = 4; k < 5; k++) { for (ulong depth = 3; depth <= max_depths[k]; depth++) { @@ -115,7 +112,6 @@ int main() info_t info; info.prime = primes[k]; - info.maxdepth = max_depths[k]; info.depth = depth; info.stride = 1; @@ -127,10 +123,12 @@ int main() prof_repeat(min+0, &max, sample_sd_fft, (void *) &info); prof_repeat(min+1, &max, sample_dft, (void *) &info); + prof_repeat(min+2, &max, sample_idft_t, (void *) &info); - flint_printf("%.1e\t%.1e\t\n", + flint_printf("%.1e\t%.1e\t%.1e\t\n", min[0]/(double)1000000/rep, - min[1]/(double)1000000/rep + min[1]/(double)1000000/rep, + min[2]/(double)1000000/rep ); } } diff --git a/src/n_fft/profile/p-init.c b/src/n_fft/profile/p-init.c index 62fb82d30d..f19117066a 100644 --- a/src/n_fft/profile/p-init.c +++ b/src/n_fft/profile/p-init.c @@ -36,7 +36,8 @@ void sample_init2_root(void * arg, ulong count) // modulus, roots of unity nmod_t mod; nmod_init(&mod, p); - ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), (p - 1) >> maxdepth, mod); + ulong cofactor = (p - 1) >> maxdepth; + ulong w0 = nmod_pow_ui(n_primitive_root_prime(p), cofactor, mod); ulong w = nmod_pow_ui(w0, 1UL<<(maxdepth - depth), mod); FLINT_TEST_INIT(state); @@ -47,7 +48,7 @@ void sample_init2_root(void * arg, ulong count) for (ulong j = 0; j < rep; j++) { n_fft_ctx_t F; - n_fft_ctx_init2_root(F, w, depth, depth, p); + n_fft_ctx_init2_root(F, w, depth, cofactor, depth, p); n_fft_ctx_clear(F); } prof_stop(); diff --git a/src/n_fft/test/t-init.c b/src/n_fft/test/t-init.c index 97a268400f..30449469c6 100644 --- a/src/n_fft/test/t-init.c +++ b/src/n_fft/test/t-init.c @@ -9,10 +9,8 @@ (at your option) any later version. See . 
*/ -#include "nmod.h" #include "test_helpers.h" #include "ulong_extras.h" -#include "nmod_vec.h" #include "n_fft.h" // return bit reversal index of k for given nbits: @@ -40,14 +38,14 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t if (F->mod != p) return 1; - //if (F->mod4 != 4*p) - // return 3; - if (F->max_depth != max_depth) - return 4; + return 2; + + if ((1 + (F->cofactor << max_depth)) != p) + return 3; if (F->depth != depth) - return 5; + return 4; // retrieve primitive root and its inverse const ulong w = F->tab_w2[2*(max_depth-2)]; @@ -56,16 +54,26 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t // check the primitive root if (n_powmod2(w, UWORD(1)<tab_w2[2*k]; if (w2 != n_powmod2(w, UWORD(1)<<(max_depth-2-k), p)) - return 7; + return 6; if (F->tab_w2[2*k+1] != n_mulmod_precomp_shoup(w2, p)) + return 7; + } + + // check all entries of tab_inv2 + for (ulong k = 0; k < max_depth; k++) + { + ulong inv2 = F->tab_inv2[2*k]; + if (inv2 != n_invmod((UWORD(1)<<(k+1)), p)) return 8; + if (F->tab_inv2[2*k+1] != n_mulmod_precomp_shoup(inv2, p)) + return 9; } // check a few random entries of tab_w and tab_iw @@ -76,15 +84,15 @@ int test_one(n_fft_ctx_t F, ulong max_depth, ulong depth, ulong p, flint_rand_t ulong wk = F->tab_w[2*k]; if (wk != n_powmod2(w, exp, p)) - return 9; - if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p)) return 10; + if (F->tab_w[2*k+1] != n_mulmod_precomp_shoup(wk, p)) + return 11; ulong iwk = F->tab_iw[2*k]; if (iwk != n_powmod2(iw, exp, p)) - return 11; - if (F->tab_iw[2*k+1] != n_mulmod_precomp_shoup(iwk, p)) return 12; + if (F->tab_iw[2*k+1] != n_mulmod_precomp_shoup(iwk, p)) + return 13; } return 0; From 9d8845d158f24a9b3a18af2c41073a91c4645ec4 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 21:25:20 +0100 Subject: [PATCH 52/71] minor changes --- src/n_fft.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git 
a/src/n_fft.h b/src/n_fft.h index 71ee9067f4..fe6614b789 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -170,6 +170,7 @@ void n_fft_set_args(n_fft_args_t F, ulong mod, nn_srcptr tab_w) * transforms / inverse transforms / transposed transforms * at length a power of 2 */ + void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F); /** 2**depth-point DFT @@ -192,6 +193,15 @@ FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) } // FIXME in progress +// not tested yet --> test == applying dft yields identity +void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO + +// FIXME in progress +// not tested yet --> test == naive version? +void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) + +// FIXME in progress +// not tested yet --> test == applying dft_t yields identity? // DOC. Note: output < n. FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) { @@ -211,16 +221,12 @@ FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) //p[k] = inv2 * p[k] - p_hi * F->mod; } // NOTE: apparently no gain from lazy variant, so - // probably better to use non-lazy one + // probably better to use non-lazy one (ensures output < n) } // FIXME see if that can be made less expensive at least for depths not too // small, by integrating into base cases of dft_node0 } -void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO -void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) - - From a872720bc9318202d7f6e6317c0427d97297a6a1 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 22:39:28 +0100 Subject: [PATCH 53/71] progress --- src/n_fft.h | 37 +++++++++++---- src/n_fft/dft.c | 101 ++++++++++++++--------------------------- src/n_fft/test/main.c | 2 + src/n_fft/test/t-dft.c | 6 +-- 4 files changed, 67 insertions(+), 79 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index fe6614b789..593b7d6d93 
100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -194,7 +194,32 @@ FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) // FIXME in progress // not tested yet --> test == applying dft yields identity -void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO +void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F); +FLINT_FORCE_INLINE void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_iw); + idft_node0_lazy12(p, depth, Fargs); + + if (depth > 0) + { + const ulong inv2 = F->tab_inv2[2*depth-2]; + const ulong inv2_pr = F->tab_inv2[2*depth-1]; + //ulong p_hi, p_lo; + for (ulong k = 0; k < (UWORD(1) << depth); k++) + { + p[k] = n_mulmod_shoup(inv2, p[k], inv2_pr, F->mod); + //umul_ppmm(p_hi, p_lo, inv2_pr, p[k]); + //p[k] = inv2 * p[k] - p_hi * F->mod; + } + // NOTE: apparently no gain from lazy variant, so + // probably better to use non-lazy one (ensures output < n) + } + // FIXME see if that can be made less expensive at least for depths not too + // small, by integrating into base cases of dft_node0 +} + + // FIXME in progress // not tested yet --> test == naive version? 
@@ -211,20 +236,12 @@ FLINT_FORCE_INLINE void n_fft_idft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) if (depth > 0) { + // see comments in idft concerning this loop const ulong inv2 = F->tab_inv2[2*depth-2]; const ulong inv2_pr = F->tab_inv2[2*depth-1]; - //ulong p_hi, p_lo; for (ulong k = 0; k < (UWORD(1) << depth); k++) - { p[k] = n_mulmod_shoup(inv2, p[k], inv2_pr, F->mod); - //umul_ppmm(p_hi, p_lo, inv2_pr, p[k]); - //p[k] = inv2 * p[k] - p_hi * F->mod; - } - // NOTE: apparently no gain from lazy variant, so - // probably better to use non-lazy one (ensures output < n) } - // FIXME see if that can be made less expensive at least for depths not too - // small, by integrating into base cases of dft_node0 } diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index 8c1d93faab..fcbdc4a558 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -11,57 +11,12 @@ #include "longlong.h" #include "n_fft.h" - -/*---------*/ -/* helpers */ -/*---------*/ - -/** Shoup's modular multiplication with precomputation, lazy - * (does not perform the excess correction step) - * --> computes either r or r+n and store it is res, where r = (a*b) % n - * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries - * --> requires nbits(n) < FLINT_BITS - */ -#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ - do { \ - umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ - res = (a) * (b) - p_hi * (n); \ - } while(0) +#include "basic.c" /*-------------*/ /* 2-point DFT */ /*-------------*/ -/** Cooley-Tukey butterfly, node 0 - * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n - * * In-place transform - * [1 1] - * [a b] <- [a b] [1 -1] - * * n is the modulus, tmp is a temporary - */ -#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ - do { \ - tmp = (b); \ - (b) = (a) + (n) - tmp; \ - (a) = (a) + tmp; \ - } while(0) - -/** Cooley-Tukey butterfly, node 0 - * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n - * * In-place transform - * [1 1] - * [a b] <- [a b] [1 -1] - * * n2 is 2*n, 
tmp is a temporary - */ -#define DFT2_NODE0_LAZY24(a, b, n2, tmp) \ - do { \ - tmp = (b); \ - (b) = (a) + (n2) - tmp; \ - (a) = (a) + tmp; \ - if ((a) >= (n2)) \ - (a) -= (n2); \ - } while(0) - /** Cooley-Tukey butterfly, general * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform @@ -81,7 +36,6 @@ (b) = u + (n2) - v; /* [0..4n) */ \ } while(0) - /*-------------*/ /* 4-point DFT */ /*-------------*/ @@ -162,9 +116,9 @@ * x^2 - w1 x^2 + w1 * / \ / \ * x - w2 x + w2 x - w3 x + w3 - * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that this - * really is the subproduct tree built from the four roots - * w2, -w2, I*w2, -I*w2 of x**4 - w1 + * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above + * is a Vandermonde matrix and this tree really is the subproduct tree built + * from the four roots w2, -w2, I*w2, -I*w2 of x**4 - w1 */ #define DFT4_LAZY44(a, b, c, d, \ w1, w1_pr, w2, w2_pr, w3, w3_pr, \ @@ -657,17 +611,23 @@ do { void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 3) + { DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); + } else if (depth == 4) + { DFT16_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], node, F->mod, F->mod2, F->tab_w); + } else if (depth == 5) + { DFT32_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], node, F->mod, F->mod2, F->tab_w); + } else { const ulong len = UWORD(1) << depth; @@ -717,17 +677,23 @@ void dft_lazy44(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 3) + { DFT8_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); + } else if (depth == 4) + { 
DFT16_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], F->mod, F->mod2, F->tab_w); + } else if (depth == 5) + { DFT32_NODE0_LAZY24(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], F->mod, F->mod2, F->tab_w); + } else { const ulong len = UWORD(1) << depth; @@ -769,32 +735,21 @@ void dft_node0_lazy24(nn_ptr p, ulong depth, n_fft_args_t F) */ void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) { - if (depth == 0) - return; - - if (depth == 1) + if (depth == 4) { - ulong tmp; - DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); - } - else if (depth == 2) - { - ulong p_hi, p_lo; - DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); - } - else if (depth == 3) - DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); - else if (depth == 4) DFT16_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], F->mod, F->mod2, F->tab_w); + } else if (depth == 5) + { DFT32_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25], p[26], p[27], p[28], p[29], p[30], p[31], F->mod, F->mod2, F->tab_w); - else + } + else if (depth > 5) { const ulong len = UWORD(1) << depth; @@ -819,5 +774,19 @@ void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) dft_lazy44(p2, depth-2, 2, F); dft_lazy44(p3, depth-2, 3, F); } + else if (depth == 3) + { + DFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], F->mod, F->mod2, F->tab_w); + } + else if (depth == 2) + { + ulong p_hi, p_lo; + DFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + } + 
else if (depth == 1) + { + ulong tmp; + DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); + } } diff --git a/src/n_fft/test/main.c b/src/n_fft/test/main.c index a03cd0faa0..296d96f361 100644 --- a/src/n_fft/test/main.c +++ b/src/n_fft/test/main.c @@ -13,6 +13,7 @@ #include "t-init.c" #include "t-dft.c" +#include "t-idft.c" /* Array of test functions ***************************************************/ @@ -20,6 +21,7 @@ test_struct tests[] = { TEST_FUNCTION(n_fft_ctx_init2), TEST_FUNCTION(n_fft_dft), + TEST_FUNCTION(n_fft_idft), }; /* main function *************************************************************/ diff --git a/src/n_fft/test/t-dft.c b/src/n_fft/test/t-dft.c index 653bc1efc6..d6c7bd66ec 100644 --- a/src/n_fft/test/t-dft.c +++ b/src/n_fft/test/t-dft.c @@ -20,7 +20,7 @@ #define MAX_EVAL_DEPTH 11 // must be <= 12 // vector equality up to reduction mod -static inline int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) +int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) { for (ulong k = 0; k < len; k++) { @@ -36,7 +36,7 @@ static inline int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, } // testing that all elements of "vec" are less than "bound" -static inline int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) +int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) { for (ulong k = 0; k < len; k++) if (vec[k] >= bound) @@ -97,7 +97,7 @@ TEST_FUNCTION_START(n_fft_dft, state) nmod_poly_init(pol, mod.n); nmod_poly_randtest(pol, state, len); - // naive evals by Horner, in bit reversed order + // evals via general multipoint evaluation nn_ptr evals_br = _nmod_vec_init(len); if (len == 1) evals_br[0] = nmod_poly_evaluate_nmod(pol, UWORD(1)); From c29a5d19677e54fdbaa6945c9dbd8a5153e1834e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 22:40:17 +0100 Subject: [PATCH 54/71] add files --- src/n_fft/basic.c | 65 +++++++++++++++++ src/n_fft/idft.c | 155 
++++++++++++++++++++++++++++++++++++++++ src/n_fft/test/t-idft.c | 145 +++++++++++++++++++++++++++++++++++++ 3 files changed, 365 insertions(+) create mode 100644 src/n_fft/basic.c create mode 100644 src/n_fft/idft.c create mode 100644 src/n_fft/test/t-idft.c diff --git a/src/n_fft/basic.c b/src/n_fft/basic.c new file mode 100644 index 0000000000..bf479513e5 --- /dev/null +++ b/src/n_fft/basic.c @@ -0,0 +1,65 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See <https://www.gnu.org/licenses/>. +*/ + +#ifndef N_FFT_BASIC_H +#define N_FFT_BASIC_H + +/*---------*/ +/* helpers */ +/*---------*/ + +/** Shoup's modular multiplication with precomputation, lazy + * (does not perform the excess correction step) + * --> computes either r or r+n and stores it in res, where r = (a*b) % n + * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries + * --> requires nbits(n) < FLINT_BITS + */ +#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ + do { \ + umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ + res = (a) * (b) - p_hi * (n); \ + } while(0) + +/*-------------*/ +/* 2-point DFT */ +/*-------------*/ + +/** Butterfly, node 0 + * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n is the modulus, tmp is a temporary + */ +#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n) - tmp; \ + (a) = (a) + tmp; \ + } while(0) + +/** Butterfly, node 0 + * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n2 is 2*n, tmp is a temporary + */ +#define DFT2_NODE0_LAZY24(a, b, n2, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n2) - tmp; \ + (a) = (a) + tmp;
\ + if ((a) >= (n2)) \ + (a) -= (n2); \ + } while(0) + +#endif // N_FFT_BASIC_H diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c new file mode 100644 index 0000000000..99989ac66e --- /dev/null +++ b/src/n_fft/idft.c @@ -0,0 +1,155 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See <https://www.gnu.org/licenses/>. +*/ + +#include "longlong.h" +#include "n_fft.h" +#include "basic.c" + +/*---------*/ +/* helpers */ +/*---------*/ + +// FIXME repeated from dft.c, see about making common basic macros / defs file +/** Shoup's modular multiplication with precomputation, lazy + * (does not perform the excess correction step) + * --> computes either r or r+n and stores it in res, where r = (a*b) % n + * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries + * --> requires nbits(n) < FLINT_BITS + */ +#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ + do { \ + umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ + res = (a) * (b) - p_hi * (n); \ + } while(0) + +/*--------------*/ +/* 2-point IDFT */ +/*--------------*/ + +/** Gentleman-Sande butterfly, general + * * in [0..2n) / out [0..2n) / max < 4n + * * In-place transform + * [1 w] + * [a b] <- [a b] [1 -w] + * * n2 is 2*n, iw_pr is the precomputed data for multiplication by iw mod n + * p_hi, p_lo, tmp are temporaries + * * can be seen as interpolation at points w = 1 / iw and -w, up to a scaling + * by 1/2, since the inverse of [1 iw] is 1/2 * [1 1] + * [1 -iw] [w -w] + */ +#define IDFT2_LAZY22(a, b, n, n2, w, w_pr, p_hi, p_lo, tmp) \ +do { \ + tmp = (a) + (n2) - (b); /* [0..4n) */ \ + (a) = (a) + (b); /* [0..4n) */ \ + if ((a) >= (n2)) \ + (a) -= (n2); /* [0..2n) */ \ + N_MULMOD_PRECOMP_LAZY((b), w, tmp, w_pr, n, p_hi, p_lo); \ + /* --> (b) in [0..2n) */ \
+} while(0) + +/*--------------*/ +/* 4-point IDFT */ +/*--------------*/ + +/** 4-point IDFT, general + * * in [0..2n) / out [0..2n) / max < 4n + * * In-place transform + * [1 w2 w1 w3] + * [1 -w2 I*w1 -I*w3] + * [a b c d] <- [a b c d] [1 w2 -w1 -w3] + * [1 -w2 -I*w1 I*w3] + * + * [1 1 ] [1 w2 ] + * [ 1 I] [1 -w2 ] + * == [a b c d] [1 -1 ] [ w1 w3] + * [ 1 -I] [ w1 -w3] + */ +#define IDFT4_LAZY22(a,b,c,d, \ + I,I_pr,w1,w1_pr,w2,w2_pr,w3,w3_pr, \ + n,n2,n4,p_hi,p_lo) \ +do { \ + const ulong u0 = (a); \ + const ulong u1 = (b); \ + const ulong u2 = (c); \ + const ulong u3 = (d); \ + \ + ulong u4 = u0 + u2; /* [0..4n) */ \ + ulong u5 = u0 + n2 - u2; /* [0..4n) */ \ + ulong u6 = u1 + u3; /* [0..4n) */ \ + ulong u7 = u1 + n2 - u3; /* [0..4n) */ \ + \ + N_MULMOD_PRECOMP_LAZY(u7, I, u7, I_pr, n, p_hi, p_lo); \ + \ + p_lo = u4 + u6; /* [0..8n) */ \ + if (p_lo >= n4) \ + p_lo -= n4; \ + if (p_lo >= n2) \ + p_lo -= n2; \ + (a) = p_lo; /* [0..2n) */ \ + \ + u4 = u4 + n4 - u6; \ + N_MULMOD_PRECOMP_LAZY((b), w2, u4, w2_pr, n, p_hi, p_lo); \ + u6 = u5 + u7; \ + N_MULMOD_PRECOMP_LAZY((c), w1, u6, w1_pr, n, p_hi, p_lo); \ + u5 = u5 + n2 - u7; \ + N_MULMOD_PRECOMP_LAZY((d), w3, u5, w3_pr, n, p_hi, p_lo); \ +} while(0) + +void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) +{ + if (depth == 1) + { + ulong p_hi, p_lo, tmp; + IDFT2_LAZY22(p[0], p[1], F->mod, F->mod2, F->tab_w[2*node], F->tab_w[2*node+1], p_hi, p_lo, tmp); + } + else + { + const ulong len = UWORD(1) << depth; + idft_lazy22(p, depth-1, 2*node, F); + idft_lazy22(p+len/2, depth-1, 2*node+1, F); + + const ulong w = F->tab_w[4*node]; + const ulong w_pr = F->tab_w[4*node+1]; + ulong p_hi, p_lo, tmp; + + for (ulong k = 0; k < len/2; k++) + { + IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + } + } +} + +void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) +{ + if (depth == 0) + return; + + if (depth == 1) + { + ulong tmp; + DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); + } 
+ else + { + const ulong len = UWORD(1) << depth; + idft_node0_lazy12(p, depth-1, F); + idft_lazy22(p+len/2, depth-1, 1, F); + + const ulong I = F->tab_w[0]; + const ulong I_pr = F->tab_w[1]; + ulong p_hi, p_lo, tmp; + + for (ulong k = 0; k < len/2; k++) + { + IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, I, I_pr, p_hi, p_lo, tmp); + } + } +} diff --git a/src/n_fft/test/t-idft.c b/src/n_fft/test/t-idft.c new file mode 100644 index 0000000000..6ccc901798 --- /dev/null +++ b/src/n_fft/test/t-idft.c @@ -0,0 +1,145 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "flint.h" +#include "test_helpers.h" +#include "ulong_extras.h" +#include "nmod.h" +#include "nmod_poly.h" +#include "nmod_vec.h" +#include "n_fft.h" + +#define MAX_EVAL_DEPTH 11 // must be <= 12 + +// vector equality up to reduction mod +int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) +{ + for (ulong k = 0; k < len; k++) + { + ulong v1; + ulong v2; + NMOD_RED(v1, vec1[k], mod); + NMOD_RED(v2, vec2[k], mod); + if (v1 != v2) + return 0; + } + + return 1; +} + +// testing that all elements of "vec" are less than "bound" +int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) +{ + for (ulong k = 0; k < len; k++) + if (vec[k] >= bound) + return 0; + + return 1; +} + + +TEST_FUNCTION_START(n_fft_idft, state) +{ + int i; + + for (i = 0; i < 200 * flint_test_multiplier(); i++) + { + // take some FFT prime p with max_depth >= 12 + ulong max_depth, prime; + + // half of tests == fixed large prime, close to limit + // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1 + // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1 + if (i > 100000) // TODO +#if FLINT_BITS == 64 + prime = 
UWORD(4611686018427322369); +#else // FLINT_BITS == 32 + prime = UWORD(1073479681); +#endif + else + { + max_depth = 12 + n_randint(state, 6); + prime = 1 + (UWORD(1) << max_depth); + while (! n_is_prime(prime)) + prime += (UWORD(1) << max_depth); + } + max_depth = flint_ctz(prime-1); + + nmod_t mod; + nmod_init(&mod, prime); + + // init FFT root tables + n_fft_ctx_t F; + n_fft_ctx_init2(F, MAX_EVAL_DEPTH, prime); + + // retrieve roots, used later for multipoint evaluation + nn_ptr roots = flint_malloc((UWORD(1) << MAX_EVAL_DEPTH) * sizeof(ulong)); + for (ulong k = 0; k < (UWORD(1) << (MAX_EVAL_DEPTH-1)); k++) + { + roots[2*k] = F->tab_w[2*k]; + roots[2*k+1] = prime - F->tab_w[2*k]; // < prime since F->tab_w[2*k] != 0 + } + + for (ulong depth = 0; depth <= MAX_EVAL_DEPTH; depth++) + { + const ulong len = (UWORD(1) << depth); + + // choose random evals of degree == len + nn_ptr evals = flint_malloc(len * sizeof(ulong)); + for (ulong k = 0; k < len; k++) + evals[k] = n_randint(state, prime); + + // naive evals by Horner, in bit reversed order + nn_ptr evals_br = _nmod_vec_init(len); + if (len == 1) + evals_br[0] = nmod_poly_evaluate_nmod(pol, UWORD(1)); + else + nmod_poly_evaluate_nmod_vec(evals_br, pol, roots, len); + + // evals by DFT + ulong * p = _nmod_vec_init(len); + _nmod_vec_set(p, pol->coeffs, len); + + n_fft_dft(p, depth, F); + + int res = nmod_vec_red_equal(evals_br, p, len, mod); + + if (!res) + TEST_FUNCTION_FAIL( + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "failed equality test\n", + prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + + res = nmod_vec_range(p, len, 4*mod.n); + + if (!res) + TEST_FUNCTION_FAIL( + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "failed range test\n", + prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + + _nmod_vec_clear(p); + flint_free(evals); + _nmod_vec_clear(evals_br); + } + + flint_free(roots); + n_fft_ctx_clear(F); + } + + 
TEST_FUNCTION_END(state); +} From 66523a76f65c430a12255911876712cd091aa3fd Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 22:49:34 +0100 Subject: [PATCH 55/71] idft test passes --- src/n_fft/idft.c | 4 +- src/n_fft/test/t-idft.c | 117 ++++++++++++++++++++++------------------ 2 files changed, 67 insertions(+), 54 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 99989ac66e..050e59800a 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -116,8 +116,8 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) idft_lazy22(p, depth-1, 2*node, F); idft_lazy22(p+len/2, depth-1, 2*node+1, F); - const ulong w = F->tab_w[4*node]; - const ulong w_pr = F->tab_w[4*node+1]; + const ulong w = F->tab_w[2*node]; + const ulong w_pr = F->tab_w[2*node+1]; ulong p_hi, p_lo, tmp; for (ulong k = 0; k < len/2; k++) diff --git a/src/n_fft/test/t-idft.c b/src/n_fft/test/t-idft.c index 6ccc901798..a29018bb8c 100644 --- a/src/n_fft/test/t-idft.c +++ b/src/n_fft/test/t-idft.c @@ -20,30 +20,30 @@ #define MAX_EVAL_DEPTH 11 // must be <= 12 // vector equality up to reduction mod -int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) -{ - for (ulong k = 0; k < len; k++) - { - ulong v1; - ulong v2; - NMOD_RED(v1, vec1[k], mod); - NMOD_RED(v2, vec2[k], mod); - if (v1 != v2) - return 0; - } - - return 1; -} - -// testing that all elements of "vec" are less than "bound" -int nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) -{ - for (ulong k = 0; k < len; k++) - if (vec[k] >= bound) - return 0; - - return 1; -} +/* int nmod_vec_red_equal(nn_srcptr vec1, nn_srcptr vec2, ulong len, nmod_t mod) */ +/* { */ +/* for (ulong k = 0; k < len; k++) */ +/* { */ +/* ulong v1; */ +/* ulong v2; */ +/* NMOD_RED(v1, vec1[k], mod); */ +/* NMOD_RED(v2, vec2[k], mod); */ +/* if (v1 != v2) */ +/* return 0; */ +/* } */ + +/* return 1; */ +/* } */ + +/* // testing that all elements of "vec" are less than "bound" */ +/* int 
nmod_vec_range(nn_srcptr vec, ulong len, ulong bound) */ +/* { */ +/* for (ulong k = 0; k < len; k++) */ +/* if (vec[k] >= bound) */ +/* return 0; */ + +/* return 1; */ +/* } */ TEST_FUNCTION_START(n_fft_idft, state) @@ -97,44 +97,57 @@ TEST_FUNCTION_START(n_fft_idft, state) for (ulong k = 0; k < len; k++) evals[k] = n_randint(state, prime); - // naive evals by Horner, in bit reversed order - nn_ptr evals_br = _nmod_vec_init(len); - if (len == 1) - evals_br[0] = nmod_poly_evaluate_nmod(pol, UWORD(1)); - else - nmod_poly_evaluate_nmod_vec(evals_br, pol, roots, len); + // general interpolation + nmod_poly_t pol; + nmod_poly_init(pol, prime); + nmod_poly_interpolate_nmod_vec(pol, roots, evals, len); - // evals by DFT + // evals by IDFT ulong * p = _nmod_vec_init(len); - _nmod_vec_set(p, pol->coeffs, len); - - n_fft_dft(p, depth, F); + _nmod_vec_set(p, evals, len); - int res = nmod_vec_red_equal(evals_br, p, len, mod); - - if (!res) - TEST_FUNCTION_FAIL( - "prime = %wu\n" - "root of unity = %wu\n" - "max_depth = %wu\n" - "depth = %wu\n" - "failed equality test\n", - prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + n_fft_idft(p, depth, F); - res = nmod_vec_range(p, len, 4*mod.n); + int res = _nmod_vec_equal(pol->coeffs, p, len); if (!res) + { + _nmod_vec_print(p, len, mod); + _nmod_vec_print(pol->coeffs, len, mod); TEST_FUNCTION_FAIL( - "prime = %wu\n" - "root of unity = %wu\n" - "max_depth = %wu\n" - "depth = %wu\n" - "failed range test\n", - prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + "prime = %wu\n" + "root of unity = %wu\n" + "max_depth = %wu\n" + "depth = %wu\n" + "failed equality test\n", + prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + } + + //int res = nmod_vec_red_equal(evals_br, p, len, mod); + + //if (!res) + // TEST_FUNCTION_FAIL( + // "prime = %wu\n" + // "root of unity = %wu\n" + // "max_depth = %wu\n" + // "depth = %wu\n" + // "failed equality test\n", + // prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); + + //res = 
nmod_vec_range(p, len, 4*mod.n); + + //if (!res) + // TEST_FUNCTION_FAIL( + // "prime = %wu\n" + // "root of unity = %wu\n" + // "max_depth = %wu\n" + // "depth = %wu\n" + // "failed range test\n", + // prime, F->tab_w2[2*(max_depth-2)], max_depth, depth); _nmod_vec_clear(p); flint_free(evals); - _nmod_vec_clear(evals_br); + nmod_poly_clear(pol); } flint_free(roots); From 453c068c1fb0d04ef2acf1a366d9bf2d6a02bec5 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sun, 27 Oct 2024 22:50:26 +0100 Subject: [PATCH 56/71] idft test passes --- src/n_fft/test/t-idft.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/n_fft/test/t-idft.c b/src/n_fft/test/t-idft.c index a29018bb8c..14c8e1ca93 100644 --- a/src/n_fft/test/t-idft.c +++ b/src/n_fft/test/t-idft.c @@ -58,7 +58,7 @@ TEST_FUNCTION_START(n_fft_idft, state) // half of tests == fixed large prime, close to limit // 62 bits: prime = 4611686018427322369 == 2**62 - 2**16 + 1 // 30 bits: prime = 1073479681 == 2**30 - 2**18 + 1 - if (i > 100000) // TODO + if (i > 100) #if FLINT_BITS == 64 prime = UWORD(4611686018427322369); #else // FLINT_BITS == 32 From 82d5cbfab521af3d649c3cf8bea5b5a4565362ed Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 00:41:26 +0100 Subject: [PATCH 57/71] idft in progress --- src/n_fft.h | 13 ++- src/n_fft/basic.c | 65 ------------ src/n_fft/dft.c | 83 ++------------- src/n_fft/idft.c | 138 +++++++++++++----------- src/n_fft/n_fft_macros.h | 213 ++++++++++++++++++++++++++++++++++++++ src/n_fft/profile/p-dft.c | 14 ++- 6 files changed, 318 insertions(+), 208 deletions(-) delete mode 100644 src/n_fft/basic.c create mode 100644 src/n_fft/n_fft_macros.h diff --git a/src/n_fft.h b/src/n_fft.h index 593b7d6d93..ffe57e7d4f 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -194,12 +194,13 @@ FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) // FIXME in progress // not tested yet --> test == applying dft yields identity -void idft_node0_lazy12(nn_ptr 
p, ulong depth, n_fft_args_t F); +// DOC. Note: output < n. +void idft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F); FLINT_FORCE_INLINE void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F) { n_fft_args_t Fargs; n_fft_set_args(Fargs, F->mod, F->tab_iw); - idft_node0_lazy12(p, depth, Fargs); + idft_node0_lazy14(p, depth, Fargs); if (depth > 0) { @@ -223,7 +224,13 @@ FLINT_FORCE_INLINE void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F) // FIXME in progress // not tested yet --> test == naive version? -void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F); // TODO (idft on inverted roots, non-scaled) +// DOC. Note: output < 2n (?). +FLINT_FORCE_INLINE void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) +{ + n_fft_args_t Fargs; + n_fft_set_args(Fargs, F->mod, F->tab_w); + idft_node0_lazy14(p, depth, Fargs); +} // FIXME in progress // not tested yet --> test == applying dft_t yields identity? diff --git a/src/n_fft/basic.c b/src/n_fft/basic.c deleted file mode 100644 index bf479513e5..0000000000 --- a/src/n_fft/basic.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - Copyright (C) 2024 Vincent Neiger - - This file is part of FLINT. - - FLINT is free software: you can redistribute it and/or modify it under - the terms of the GNU Lesser General Public License (LGPL) as published - by the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. See . 
-*/ - -#ifndef N_FFT_BASIC_H -#define N_FFT_BASIC_H - -/*---------*/ -/* helpers */ -/*---------*/ - -/** Shoup's modular multiplication with precomputation, lazy - * (does not perform the excess correction step) - * --> computes either r or r+n and store it is res, where r = (a*b) % n - * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries - * --> requires nbits(n) < FLINT_BITS - */ -#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ - do { \ - umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ - res = (a) * (b) - p_hi * (n); \ - } while(0) - -/*-------------*/ -/* 2-point DFT */ -/*-------------*/ - -/** Butterfly, node 0 - * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n - * * In-place transform - * [1 1] - * [a b] <- [a b] [1 -1] - * * n is the modulus, tmp is a temporary - */ -#define DFT2_NODE0_LAZY12(a, b, n, tmp) \ - do { \ - tmp = (b); \ - (b) = (a) + (n) - tmp; \ - (a) = (a) + tmp; \ - } while(0) - -/** Butterfly, node 0 - * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n - * * In-place transform - * [1 1] - * [a b] <- [a b] [1 -1] - * * n2 is 2*n, tmp is a temporary - */ -#define DFT2_NODE0_LAZY24(a, b, n2, tmp) \ - do { \ - tmp = (b); \ - (b) = (a) + (n2) - tmp; \ - (a) = (a) + tmp; \ - if ((a) >= (n2)) \ - (a) -= (n2); \ - } while(0) - -#endif // N_FFT_BASIC_H diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index fcbdc4a558..f2e6b72987 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -11,7 +11,7 @@ #include "longlong.h" #include "n_fft.h" -#include "basic.c" +#include "n_fft_macros.h" /*-------------*/ /* 2-point DFT */ @@ -40,69 +40,6 @@ /* 4-point DFT */ /*-------------*/ -/** 4-point DFT, node 0 - * * in [0..n) / out [0..4n) / max < 4n - * * In-place transform - * [1 1 1 1] - * [1 -1 I -I] - * [a b c d] <- [a b c d] [1 1 -1 -1] - * [1 -1 -I I] - * * Corresponds to reducing down the tree with nodes - * x^4 - 1 - * / \ - * x^2 - 1 x^2 + 1 - * / \ / \ - * x - 1 x + 1 x - I x + I - * where I is typically a square 
root of -1 - * (but this property is not exploited) - * * n is the modulus and n2 == 2*n, p_hi, p_lo are temporaries - */ -#define DFT4_NODE0_LAZY14(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ - do { \ - const ulong v0 = (a); \ - const ulong v1 = (b); \ - const ulong v2 = (c); \ - const ulong v3 = (d); \ - ulong v4 = v0 + v2; /* < 2*n */ \ - ulong v5 = v0 + (n) - v2; /* < 2*n */ \ - ulong v6 = v1 + v3; /* < 2*n */ \ - ulong v7; \ - N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n) - v3, (I_pr), (n), \ - p_hi, p_lo); \ - (a) = v4 + v6; /* < 4*n */ \ - (b) = v4 + (n2) - v6; /* < 4*n */ \ - (c) = v5 + v7; /* < 3*n */ \ - (d) = v5 + (n2) - v7; /* < 4*n */ \ - } while(0) - -/** 4-point DFT, node 0 - * * in [0..2n) / out [0..4n) / max < 4n - * * other than this, same specification as DFT4_NODE0_LAZY14 - */ -#define DFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ - do { \ - const ulong v0 = (a); \ - const ulong v1 = (b); \ - const ulong v2 = (c); \ - const ulong v3 = (d); \ - ulong v4 = v0 + v2; /* < 4*n */ \ - if (v4 >= (n2)) \ - v4 -= (n2); /* < 2*n */ \ - ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ - if (v5 >= (n2)) \ - v5 -= (n2); /* < 2*n */ \ - ulong v6 = v1 + v3; /* < 4*n */ \ - if (v6 >= (n2)) \ - v6 -= (n2); /* < 2*n */ \ - ulong v7; \ - N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ - p_hi, p_lo); \ - (a) = v4 + v6; /* < 4*n */ \ - (b) = v4 + (n2) - v6; /* < 4*n */ \ - (c) = v5 + v7; /* < 4*n */ \ - (d) = v5 + (n2) - v7; /* < 4*n */ \ - } while(0) - /** 4-point DFT, general * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform @@ -176,10 +113,10 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT2_NODE0_LAZY12(p0, p4, mod, tmp); \ - DFT2_NODE0_LAZY12(p1, p5, mod, tmp); \ - DFT2_NODE0_LAZY12(p2, p6, mod, tmp); \ - DFT2_NODE0_LAZY12(p3, p7, mod, tmp); \ + BUTTERFLY_LAZY12(p0, p4, mod, tmp); \ + BUTTERFLY_LAZY12(p1, p5, mod, tmp); \ + BUTTERFLY_LAZY12(p2, p6, mod, tmp); \ + BUTTERFLY_LAZY12(p3, p7, mod, tmp); \ \ DFT4_NODE0_LAZY24(p0, p1, p2, p3, 
\ tab_w[2], tab_w[3], \ @@ -202,10 +139,10 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT2_NODE0_LAZY24(p0, p4, mod2, tmp); \ - DFT2_NODE0_LAZY24(p1, p5, mod2, tmp); \ - DFT2_NODE0_LAZY24(p2, p6, mod2, tmp); \ - DFT2_NODE0_LAZY24(p3, p7, mod2, tmp); \ + BUTTERFLY_LAZY24(p0, p4, mod2, tmp); \ + BUTTERFLY_LAZY24(p1, p5, mod2, tmp); \ + BUTTERFLY_LAZY24(p2, p6, mod2, tmp); \ + BUTTERFLY_LAZY24(p3, p7, mod2, tmp); \ \ DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ @@ -786,7 +723,7 @@ void dft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) else if (depth == 1) { ulong tmp; - DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); + BUTTERFLY_LAZY12(p[0], p[1], F->mod, tmp); } } diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 050e59800a..c3be48bea0 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -11,24 +11,7 @@ #include "longlong.h" #include "n_fft.h" -#include "basic.c" - -/*---------*/ -/* helpers */ -/*---------*/ - -// FIXME repeated from dft.c, see about making common basic macros / defs file -/** Shoup's modular multiplication with precomputation, lazy - * (does not perform the excess correction step) - * --> computes either r or r+n and store it is res, where r = (a*b) % n - * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries - * --> requires nbits(n) < FLINT_BITS - */ -#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ - do { \ - umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ - res = (a) * (b) - p_hi * (n); \ - } while(0) +#include "n_fft_macros.h" /*--------------*/ /* 2-point IDFT */ @@ -37,8 +20,8 @@ /** Gentleman-Sande butterfly, general * * in [0..2n) / out [0..2n) / max < 4n * * In-place transform - * [1 w] - * [a b] <- [a b] [1 -w] + * [1 iw] + * [a b] <- [a b] [1 -iw] * * n2 is 2*n, iw_pr is the precomputed data for multiplication by iw mod n * p_hi, p_lo, tmp are temporaries * * can be seen as interpolation at points w = 1 / iw and -w, up to a scaling @@ -60,49 +43,63 @@ do { \ /*--------------*/ /** 4-point 
IDFT, general - * * in [0..2n) / out [0..2n) / max < 4n + * * in [0..4n) / out [0..4n) / max < 4n * * In-place transform - * [1 w2 w1 w3] - * [1 -w2 I*w1 -I*w3] - * [a b c d] <- [a b c d] [1 w2 -w1 -w3] - * [1 -w2 -I*w1 I*w3] - * - * [1 1 ] [1 w2 ] - * [ 1 I] [1 -w2 ] - * == [a b c d] [1 -1 ] [ w1 w3] - * [ 1 -I] [ w1 -w3] + * [ 1 iw2 iw1 iw1*iw2] + * [ 1 -iw2 iw1 -iw1*iw2] + * [a b c d] <- [a b c d] [ 1 w3 -iw1 -iw1*iw3] + * [ 1 -w3 -iw1 iw1*iw3] + * [1 iw2 0 0] [1 0 w1 0] + * == [a b c d] [1 -iw2 0 0] [0 1 0 w1] + * [0 0 1 iw3] [1 0 -w1 0] + * [0 0 1 -iw3] [0 1 0 -w1] + * * Corresponds, up to scaling by 1/4, to going up the tree with nodes + * x^4 - w1**2 + * / \ + * x^2 - w1 x^2 + w1 + * / \ / \ + * x - w2 x + w2 x - w3 x + w3 + * typically w2**2 == w1 and w3 == I*w2 (hence w3**2 == -w1) so that the above + * is the inverse of a Vandermonde matrix and this tree really is the + * subproduct tree built from the four roots w2, -w2, I*w2, -I*w2 of x**4 - w1 */ -#define IDFT4_LAZY22(a,b,c,d, \ - I,I_pr,w1,w1_pr,w2,w2_pr,w3,w3_pr, \ - n,n2,n4,p_hi,p_lo) \ +#define IDFT4_LAZY22(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2, p_hi, p_lo) \ do { \ - const ulong u0 = (a); \ - const ulong u1 = (b); \ - const ulong u2 = (c); \ - const ulong u3 = (d); \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5; \ + N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n2) - v1, (w2_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n2) - v3, (w3_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ \ - ulong u4 = u0 + u2; /* [0..4n) */ \ - ulong u5 = u0 + n2 - u2; /* [0..4n) */ \ - ulong u6 = u1 + u3; /* [0..4n) */ \ - ulong u7 = u1 + n2 - u3; /* [0..4n) */ \ - \ - N_MULMOD_PRECOMP_LAZY(u7, I, u7, I_pr, n, p_hi, p_lo); \ - \ - 
p_lo = u4 + u6; /* [0..8n) */ \ - if (p_lo >= n4) \ - p_lo -= n4; \ - if (p_lo >= n2) \ - p_lo -= n2; \ - (a) = p_lo; /* [0..2n) */ \ - \ - u4 = u4 + n4 - u6; \ - N_MULMOD_PRECOMP_LAZY((b), w2, u4, w2_pr, n, p_hi, p_lo); \ - u6 = u5 + u7; \ - N_MULMOD_PRECOMP_LAZY((c), w1, u6, w1_pr, n, p_hi, p_lo); \ - u5 = u5 + n2 - u7; \ - N_MULMOD_PRECOMP_LAZY((d), w3, u5, w3_pr, n, p_hi, p_lo); \ + (a) = v4 + v6; \ + if ((a) >= (n2)) \ + (a) -= (n2); /* < 2*n */ \ + (b) = v5 + v7; \ + if ((b) >= (n2)) \ + (b) -= (n2); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ } while(0) +/*--------------*/ +/* 4-point IDFT */ +/*--------------*/ + void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1) @@ -110,6 +107,15 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) ulong p_hi, p_lo, tmp; IDFT2_LAZY22(p[0], p[1], F->mod, F->mod2, F->tab_w[2*node], F->tab_w[2*node+1], p_hi, p_lo, tmp); } + else if (depth == 2) + { + ulong p_hi, p_lo; + IDFT4_LAZY22(p[0], p[1], p[2], p[3], + F->tab_w[2*node], F->tab_w[2*node+1], + F->tab_w[4*node], F->tab_w[4*node+1], + F->tab_w[4*node+2], F->tab_w[4*node+3], + F->mod, F->mod2, p_hi, p_lo); + } else { const ulong len = UWORD(1) << depth; @@ -127,7 +133,7 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) } } -void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) +void idft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 0) return; @@ -135,21 +141,27 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) if (depth == 1) { ulong tmp; - DFT2_NODE0_LAZY12(p[0], p[1], F->mod, tmp); + BUTTERFLY_LAZY12(p[0], p[1], F->mod, tmp); + } + else if (depth == 2) + { + ulong p_hi, p_lo; + IDFT4_NODE0_LAZY12(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); } else { const ulong 
len = UWORD(1) << depth; - idft_node0_lazy12(p, depth-1, F); + idft_node0_lazy14(p, depth-1, F); idft_lazy22(p+len/2, depth-1, 1, F); - const ulong I = F->tab_w[0]; - const ulong I_pr = F->tab_w[1]; + const ulong one = F->tab_w[0]; + const ulong one_pr = F->tab_w[1]; ulong p_hi, p_lo, tmp; for (ulong k = 0; k < len/2; k++) { - IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, I, I_pr, p_hi, p_lo, tmp); + IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, one, one_pr, p_hi, p_lo, tmp); } } } diff --git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h new file mode 100644 index 0000000000..cd37f858c6 --- /dev/null +++ b/src/n_fft/n_fft_macros.h @@ -0,0 +1,213 @@ +/* + Copyright (C) 2024 Vincent Neiger + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#ifndef N_FFT_MACROS_H +#define N_FFT_MACROS_H + +/*---------*/ +/* helpers */ +/*---------*/ + +/** Shoup's modular multiplication with precomputation, lazy + * (does not perform the excess correction step) + * --> computes either r or r+n and store it is res, where r = (a*b) % n + * --> a_pr is the precomputation for n, p_hi and p_lo are temporaries + */ +#define N_MULMOD_PRECOMP_LAZY(res, a, b, a_pr, n, p_hi, p_lo) \ + do { \ + umul_ppmm(p_hi, p_lo, (a_pr), (b)); \ + res = (a) * (b) - p_hi * (n); \ + } while(0) + +/*---------------------*/ +/* radix-2 butterflies */ +/*---------------------*/ + +/** Butterfly radix 2 + * * in [0..n) x [0..n) / out [0..2n) x [0..2n) / max < 2n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n is the modulus, tmp is a temporary + */ +#define BUTTERFLY_LAZY12(a, b, n, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n) - tmp; \ + (a) = (a) + tmp; \ + } while(0) + +/** Butterfly radix 2 + * * in [0..2n) x [0..2n) / out [0..2n) x [0..4n) / max < 4n + * * In-place transform + * [1 1] + * [a b] <- [a b] [1 -1] + * * n2 is 2*n, tmp is a temporary + */ +#define BUTTERFLY_LAZY24(a, b, n2, tmp) \ + do { \ + tmp = (b); \ + (b) = (a) + (n2) - tmp; \ + (a) = (a) + tmp; \ + if ((a) >= (n2)) \ + (a) -= (n2); \ + } while(0) + +/*---------------------*/ +/* radix-4 butterflies */ +/*---------------------*/ + +/** 4-point butterfly, evaluation + * * in [0..n) / out [0..4n) / max < 4n + * * In-place transform + * [1 1 1 1] + * [1 -1 I -I] + * [a b c d] <- [a b c d] [1 1 -1 -1] + * [1 -1 -I I] + * [1 0 1 0] [1 1 0 0] + * == [a b c d] [0 1 0 I] [1 -1 0 0] + * [1 0 -1 0] [0 0 1 1] + * [0 1 0 -I] [0 0 1 -1] + * * Corresponds to reducing down the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + * where I is typically a square root of -1 + * (but this property is not exploited) + * * n is the modulus and n2 == 2*n, p_hi, p_lo are temporaries + */ +#define DFT4_NODE0_LAZY14(a, b, c, 
d, I, I_pr, n, n2, p_hi, p_lo) \ + do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 2*n */ \ + ulong v5 = v0 + (n) - v2; /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n) - v3, (I_pr), (n), \ + p_hi, p_lo); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 3*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ + } while(0) + +/** 4-point butterfly, evaluation + * * in [0..2n) / out [0..4n) / max < 4n + * * other than this, same specification as DFT4_NODE0_LAZY14 + */ +#define DFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ + do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v2; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v2; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v1 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v1 + (n2) - v3, (I_pr), (n), \ + p_hi, p_lo); \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v4 + (n2) - v6; /* < 4*n */ \ + (c) = v5 + v7; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ + } while(0) + + +/** 4-point butterfly, interpolation + * * in [0..n) / out [0..4n) / max < 4n + * * In-place transform + * [1 1 1 1] + * [1 -1 1 -1] + * [a b c d] <- [a b c d] [1 -I -1 I] + * [1 I -1 -I] + * [1 1 0 0] [1 0 1 0] + * == [a b c d] [1 -1 0 0] [0 1 0 1] + * [0 0 1 I] [1 0 -1 0] + * [0 0 1 -I] [0 1 0 -1] + * + * * If I**2 == -1, this matrix is the inverse of the one above; this + * corresponds to interpolation at 1, -1, I, -I, up to scaling by 1/4; or to + * going up the tree with nodes + * x^4 - 1 + * / \ + * x^2 - 1 x^2 + 1 + * / \ / \ + * x - 1 x + 1 x - I x + I + */ +#define IDFT4_NODE0_LAZY12(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +do { \ + 
const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 2*n */ \ + ulong v5 = v0 + (n) - v1; /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + (a) = v4 + v6; /* < 4*n */ \ + if ((a) >= (n2)) \ + (a) -= (n2); /* < 2*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + if ((b) >= (n2)) \ + (b) -= (n2); /* < 2*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + if ((c) >= (n2)) \ + (c) -= (n2); /* < 2*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ + if ((d) >= (n2)) \ + (d) -= (n2); /* < 2*n */ \ +} while(0) + +/** 4-point butterfly, interpolation + * * in [0..2n) / out [0..4n) / max < 4n + * * other than this, same specification as IDFT4_NODE0_LAZY14 + */ +#define IDFT4_NODE0_LAZY24(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v1; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + + + + +#endif /* N_FFT_MACROS_H */ diff --git a/src/n_fft/profile/p-dft.c b/src/n_fft/profile/p-dft.c index f4948dc276..bfa4174faf 100644 --- a/src/n_fft/profile/p-dft.c +++ b/src/n_fft/profile/p-dft.c @@ -46,6 +46,8 @@ void sample_##fun##_variant(void * arg, ulong count) } \ SAMPLE(dft, ) +SAMPLE(idft, ) +SAMPLE(dft_t, ) SAMPLE(idft_t, ) //SAMPLE(n_fft_dft, _stride) @@ -91,7 +93,7 @@ int main() { flint_printf("- depth is log(fft length)\n"); flint_printf("- timing DFT (length power of 
2) for several bit lengths and depths\n"); - flint_printf("depth\tsd_fft\trec4\n"); + flint_printf("depth\tsd_fft\tdft\tidft\tdft_t\tidft_t\n"); ulong primes[num_primes] = { 786433, // 20 bits, 1 + 2**18 * 3 @@ -123,12 +125,16 @@ int main() prof_repeat(min+0, &max, sample_sd_fft, (void *) &info); prof_repeat(min+1, &max, sample_dft, (void *) &info); - prof_repeat(min+2, &max, sample_idft_t, (void *) &info); + prof_repeat(min+2, &max, sample_idft, (void *) &info); + prof_repeat(min+3, &max, sample_dft_t, (void *) &info); + prof_repeat(min+4, &max, sample_idft_t, (void *) &info); - flint_printf("%.1e\t%.1e\t%.1e\t\n", + flint_printf("%.1e\t%.1e\t%.1e\t%.1e\t%.1e\t\n", min[0]/(double)1000000/rep, min[1]/(double)1000000/rep, - min[2]/(double)1000000/rep + min[2]/(double)1000000/rep, + min[3]/(double)1000000/rep, + min[4]/(double)1000000/rep ); } } From 07a7dbef97c93569f25b7bc11d739cc11a9c9c1c Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 00:42:50 +0100 Subject: [PATCH 58/71] fix name --- src/n_fft.h | 6 +++--- src/n_fft/idft.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index ffe57e7d4f..e3b8458669 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -195,12 +195,12 @@ FLINT_FORCE_INLINE void n_fft_dft(nn_ptr p, ulong depth, n_fft_ctx_t F) // FIXME in progress // not tested yet --> test == applying dft yields identity // DOC. Note: output < n. 
-void idft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F); +void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F); FLINT_FORCE_INLINE void n_fft_idft(nn_ptr p, ulong depth, n_fft_ctx_t F) { n_fft_args_t Fargs; n_fft_set_args(Fargs, F->mod, F->tab_iw); - idft_node0_lazy14(p, depth, Fargs); + idft_node0_lazy12(p, depth, Fargs); if (depth > 0) { @@ -229,7 +229,7 @@ FLINT_FORCE_INLINE void n_fft_dft_t(nn_ptr p, ulong depth, n_fft_ctx_t F) { n_fft_args_t Fargs; n_fft_set_args(Fargs, F->mod, F->tab_w); - idft_node0_lazy14(p, depth, Fargs); + idft_node0_lazy12(p, depth, Fargs); } // FIXME in progress diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index c3be48bea0..472a63294b 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -133,7 +133,7 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) } } -void idft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) +void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) { if (depth == 0) return; @@ -152,7 +152,7 @@ void idft_node0_lazy14(nn_ptr p, ulong depth, n_fft_args_t F) else { const ulong len = UWORD(1) << depth; - idft_node0_lazy14(p, depth-1, F); + idft_node0_lazy12(p, depth-1, F); idft_lazy22(p+len/2, depth-1, 1, F); const ulong one = F->tab_w[0]; From f0fed077d5c886a2e7509ebe47ec769f311924da Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 01:26:14 +0100 Subject: [PATCH 59/71] idft in progress --- src/n_fft/idft.c | 68 +++++++++++++++++++++++++++++++++++----- src/n_fft/n_fft_macros.h | 2 +- 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 472a63294b..d2d7514410 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -38,6 +38,18 @@ do { \ /* --> (b) in [0..2n) */ \ } while(0) +// move in macros? 
+// in [0..4n) x [0..2n) -> out [0..4n) x [0..4n) +// TODO rename +#define BUTTERFLY_LAZY22(a, b, n2, tmp) \ +do { \ + tmp = (a); \ + if (tmp >= (n2)) \ + tmp -= (n2); /* [0..2n) */ \ + (a) = tmp + (b); /* [0..4n) */ \ + (b) = tmp + (n2) - (b); /* [0..4n) */ \ +} while(0) + /*--------------*/ /* 4-point IDFT */ /*--------------*/ @@ -47,8 +59,8 @@ do { \ * * In-place transform * [ 1 iw2 iw1 iw1*iw2] * [ 1 -iw2 iw1 -iw1*iw2] - * [a b c d] <- [a b c d] [ 1 w3 -iw1 -iw1*iw3] - * [ 1 -w3 -iw1 iw1*iw3] + * [a b c d] <- [a b c d] [ 1 iw3 -iw1 -iw1*iw3] + * [ 1 -iw3 -iw1 iw1*iw3] * [1 iw2 0 0] [1 0 w1 0] * == [a b c d] [1 -iw2 0 0] [0 1 0 w1] * [0 0 1 iw3] [1 0 -w1 0] @@ -96,10 +108,53 @@ do { \ p_hi, p_lo); /* < 2*n */ \ } while(0) + /*--------------*/ -/* 4-point IDFT */ +/* 8-point IDFT */ /*--------------*/ +#define IDFT8_NODE0_LAZY12(p0, p1, p2, p3, p4, p5, p6, p7, \ + mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + /* could use a lazy24 variant of the next macro, */ \ + /* but the gain is negligible */ \ + DFT4_LAZY44(p4, p5, p6, p7, \ + tab_w[2], tab_w[3], \ + tab_w[4], tab_w[5], \ + tab_w[6], tab_w[7], \ + mod, mod2, p_hi, p_lo, tmp); \ + \ + BUTTERFLY_LAZY12(p0, p4, mod, tmp); \ + BUTTERFLY_LAZY12(p1, p5, mod, tmp); \ + BUTTERFLY_LAZY12(p2, p6, mod, tmp); \ + BUTTERFLY_LAZY12(p3, p7, mod, tmp); \ +} while(0) + + + + + + + + + + + + + + + + +/*--------------*/ +/* general IDFT */ +/*--------------*/ + + void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1) @@ -155,13 +210,10 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) idft_node0_lazy12(p, depth-1, F); idft_lazy22(p+len/2, depth-1, 1, F); - const ulong one = F->tab_w[0]; - const ulong one_pr = F->tab_w[1]; - ulong p_hi, p_lo, tmp; - + ulong tmp; for (ulong k = 0; k < len/2; k++) { - IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, one, one_pr, p_hi, p_lo, tmp); 
+ BUTTERFLY_LAZY22(p[k], p[len/2 + k], F->mod2, tmp); } } } diff --git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h index cd37f858c6..57a3bc085f 100644 --- a/src/n_fft/n_fft_macros.h +++ b/src/n_fft/n_fft_macros.h @@ -165,7 +165,7 @@ do { \ ulong v6 = v2 + v3; /* < 2*n */ \ ulong v7; \ N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n), \ - p_hi, p_lo); /* < 2*n */ \ + p_hi, p_lo); /* < 2*n */ \ (a) = v4 + v6; /* < 4*n */ \ if ((a) >= (n2)) \ (a) -= (n2); /* < 2*n */ \ From f79ba6f62d81ade47e97370f6b32e77426a7a0ec Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 01:37:58 +0100 Subject: [PATCH 60/71] idft in progress --- src/n_fft/idft.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index d2d7514410..31ceec8870 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -113,26 +113,26 @@ do { \ /* 8-point IDFT */ /*--------------*/ -#define IDFT8_NODE0_LAZY12(p0, p1, p2, p3, p4, p5, p6, p7, \ +// TODO see how to make lazier +#define IDFT8_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ mod, mod2, tab_w) \ do { \ ulong p_hi, p_lo, tmp; \ \ - DFT4_NODE0_LAZY24(p0, p1, p2, p3, \ - tab_w[2], tab_w[3], \ - mod, mod2, p_hi, p_lo); \ - /* could use a lazy24 variant of the next macro, */ \ - /* but the gain is negligible */ \ - DFT4_LAZY44(p4, p5, p6, p7, \ + IDFT4_NODE0_LAZY12(p0, p1, p2, p3, \ + tab_w[2], tab_w[3], \ + mod, mod2, p_hi, p_lo); \ + /* TODO try a lazy12 variant of the next macro, */ \ + IDFT4_LAZY22(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ - mod, mod2, p_hi, p_lo, tmp); \ + mod, mod2, p_hi, p_lo); \ \ - BUTTERFLY_LAZY12(p0, p4, mod, tmp); \ - BUTTERFLY_LAZY12(p1, p5, mod, tmp); \ - BUTTERFLY_LAZY12(p2, p6, mod, tmp); \ - BUTTERFLY_LAZY12(p3, p7, mod, tmp); \ + BUTTERFLY_LAZY22(p0, p4, mod2, tmp); \ + BUTTERFLY_LAZY22(p1, p5, mod2, tmp); \ + BUTTERFLY_LAZY22(p2, p6, mod2, tmp); \ + 
BUTTERFLY_LAZY22(p3, p7, mod2, tmp); \ } while(0) @@ -204,6 +204,12 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) IDFT4_NODE0_LAZY12(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } + else if (depth == 3) + { + // TODO to be improved + IDFT8_NODE0_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + F->mod, F->mod2, F->tab_w); + } else { const ulong len = UWORD(1) << depth; From c41bcd414566b4131b9c4f5c0c9d9e76a9094ab6 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 01:44:40 +0100 Subject: [PATCH 61/71] idft in progress --- src/n_fft/idft.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 31ceec8870..a2b469943f 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -113,6 +113,7 @@ do { \ /* 8-point IDFT */ /*--------------*/ +// TODO doc // TODO see how to make lazier #define IDFT8_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ mod, mod2, tab_w) \ @@ -136,6 +137,35 @@ do { \ } while(0) +/** 8-point IDFT + * TODO clean, check laziness + * * in [0..?n) / out [0..?n) / max < ?n + */ +#define DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ + node, mod, mod2, tab_w) \ +do { \ + ulong p_hi, p_lo, tmp; \ + \ + const ulong w = tab_w[2*(node)]; \ + const ulong w_pr = tab_w[2*(node)+1]; \ + \ + IDFT4_LAZY22(p0, p1, p2, p3, \ + tab_w[4*(node)], tab_w[4*(node)+1], \ + tab_w[8*(node)], tab_w[8*(node)+1], \ + tab_w[8*(node)+2], tab_w[8*(node)+3], \ + mod, mod2, p_hi, p_lo); \ + \ + IDFT4_LAZY22(p4, p5, p6, p7, \ + tab_w[4*(node)+2], tab_w[4*(node)+3], \ + tab_w[8*(node)+4], tab_w[8*(node)+5], \ + tab_w[8*(node)+6], tab_w[8*(node)+7], \ + mod, mod2, p_hi, p_lo); \ + \ + IDFT2_LAZY22(p0, p4, mod, mod2, w, w_pr, p_hi, p_lo, tmp); \ + IDFT2_LAZY22(p1, p5, mod, mod2, w, w_pr, p_hi, p_lo, tmp); \ + IDFT2_LAZY22(p2, p6, mod, mod2, w, w_pr, p_hi, p_lo, tmp); \ + IDFT2_LAZY22(p3, p7, mod, mod2, w, w_pr, p_hi, p_lo, tmp); \ +} 
while(0) @@ -171,6 +201,11 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) F->tab_w[4*node+2], F->tab_w[4*node+3], F->mod, F->mod2, p_hi, p_lo); } + else if (depth == 3) + { + DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + node, F->mod, F->mod2, F->tab_w); + } else { const ulong len = UWORD(1) << depth; From f09bca0329e79453f75bc12e34f3d957ce04da99 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 09:25:48 +0100 Subject: [PATCH 62/71] idft in progress --- src/n_fft/idft.c | 19 ++++++++++++------- src/n_fft/n_fft_macros.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index a2b469943f..bd10c7fda7 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -108,7 +108,6 @@ do { \ p_hi, p_lo); /* < 2*n */ \ } while(0) - /*--------------*/ /* 8-point IDFT */ /*--------------*/ @@ -120,7 +119,7 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - IDFT4_NODE0_LAZY12(p0, p1, p2, p3, \ + IDFT4_NODE0_LAZY14(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ /* TODO try a lazy12 variant of the next macro, */ \ @@ -216,9 +215,12 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) const ulong w_pr = F->tab_w[2*node+1]; ulong p_hi, p_lo, tmp; - for (ulong k = 0; k < len/2; k++) + for (ulong k = 0; k < len/2; k+=4) { - IDFT2_LAZY22(p[k], p[len/2 + k], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + IDFT2_LAZY22(p[k+0], p[len/2 + k+0], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + IDFT2_LAZY22(p[k+1], p[len/2 + k+1], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + IDFT2_LAZY22(p[k+2], p[len/2 + k+2], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + IDFT2_LAZY22(p[k+3], p[len/2 + k+3], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); } } } @@ -236,7 +238,7 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) else if (depth == 2) { ulong p_hi, p_lo; - IDFT4_NODE0_LAZY12(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], + 
IDFT4_NODE0_LAZY14(p[0], p[1], p[2], p[3], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } else if (depth == 3) @@ -252,9 +254,12 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) idft_lazy22(p+len/2, depth-1, 1, F); ulong tmp; - for (ulong k = 0; k < len/2; k++) + for (ulong k = 0; k < len/2; k+=4) { - BUTTERFLY_LAZY22(p[k], p[len/2 + k], F->mod2, tmp); + BUTTERFLY_LAZY22(p[k+0], p[len/2 + k+0], F->mod2, tmp); + BUTTERFLY_LAZY22(p[k+1], p[len/2 + k+1], F->mod2, tmp); + BUTTERFLY_LAZY22(p[k+2], p[len/2 + k+2], F->mod2, tmp); + BUTTERFLY_LAZY22(p[k+3], p[len/2 + k+3], F->mod2, tmp); } } } diff --git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h index 57a3bc085f..0ddb81c035 100644 --- a/src/n_fft/n_fft_macros.h +++ b/src/n_fft/n_fft_macros.h @@ -180,6 +180,24 @@ do { \ (d) -= (n2); /* < 2*n */ \ } while(0) +#define IDFT4_NODE0_LAZY14(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 2*n */ \ + ulong v5 = v0 + (n) - v1; /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) + /** 4-point butterfly, interpolation * * in [0..2n) / out [0..4n) / max < 4n * * other than this, same specification as IDFT4_NODE0_LAZY14 From 38a5abae84d697ba6bb3153dd1627fd6eadc4163 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 09:34:52 +0100 Subject: [PATCH 63/71] idft in progress --- src/n_fft/idft.c | 27 ++++++++++++++++++--------- src/n_fft/n_fft_macros.h | 27 ++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index bd10c7fda7..ac0539b8dd 100644 --- a/src/n_fft/idft.c +++ 
b/src/n_fft/idft.c @@ -119,7 +119,7 @@ do { \ do { \ ulong p_hi, p_lo, tmp; \ \ - IDFT4_NODE0_LAZY14(p0, p1, p2, p3, \ + IDFT4_NODE0_LAZY12(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ /* TODO try a lazy12 variant of the next macro, */ \ @@ -250,16 +250,25 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) else { const ulong len = UWORD(1) << depth; - idft_node0_lazy12(p, depth-1, F); - idft_lazy22(p+len/2, depth-1, 1, F); - ulong tmp; - for (ulong k = 0; k < len/2; k+=4) + // 4 recursive calls with depth-2 + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + idft_node0_lazy12(p0, depth-2, F); + idft_lazy22(p1, depth-2, 1, F); + idft_lazy22(p2, depth-2, 2, F); + idft_lazy22(p3, depth-2, 3, F); + + // 4-point butterflies + // input p0,p1,p2,p3 in ?? + // output p0,p1,p2,p3 in ?? + ulong p_hi, p_lo; + for (ulong k = 0; k < len/4; k++) { - BUTTERFLY_LAZY22(p[k+0], p[len/2 + k+0], F->mod2, tmp); - BUTTERFLY_LAZY22(p[k+1], p[len/2 + k+1], F->mod2, tmp); - BUTTERFLY_LAZY22(p[k+2], p[len/2 + k+2], F->mod2, tmp); - BUTTERFLY_LAZY22(p[k+3], p[len/2 + k+3], F->mod2, tmp); + IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); } } } diff --git a/src/n_fft/n_fft_macros.h b/src/n_fft/n_fft_macros.h index 0ddb81c035..4817f0ceb2 100644 --- a/src/n_fft/n_fft_macros.h +++ b/src/n_fft/n_fft_macros.h @@ -217,7 +217,7 @@ do { \ ulong v6 = v2 + v3; /* < 4*n */ \ if (v6 >= (n2)) \ v6 -= (n2); /* < 2*n */ \ - N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n) - v3, (I_pr), (n), \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n2) - v3, (I_pr), (n), \ p_hi, p_lo); /* < 2*n */ \ (a) = v4 + v6; /* < 4*n */ \ (b) = v5 + v7; /* < 4*n */ \ @@ -225,6 +225,31 @@ do { \ (d) = v5 + (n2) - v7; /* < 4*n */ \ } while(0) +#define IDFT4_NODE0_LAZY4222(a, b, c, d, I, I_pr, n, n2, p_hi, p_lo) \ +do { \ + ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = 
(c); \ + const ulong v3 = (d); \ + if (v0 >= (n2)) \ + v0 -= (n2); /* < 2*n */ \ + ulong v4 = v0 + v1; /* < 4*n */ \ + if (v4 >= (n2)) \ + v4 -= (n2); /* < 2*n */ \ + ulong v5 = v0 + (n2) - v1; /* < 4*n */ \ + if (v5 >= (n2)) \ + v5 -= (n2); /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 4*n */ \ + if (v6 >= (n2)) \ + v6 -= (n2); /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (I), v2 + (n2) - v3, (I_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + (a) = v4 + v6; /* < 4*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + (c) = v4 + (n2) - v6; /* < 4*n */ \ + (d) = v5 + (n2) - v7; /* < 4*n */ \ +} while(0) From ece93dc046523f765f99502f8d137193baa2824f Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 09:36:02 +0100 Subject: [PATCH 64/71] minor fix --- src/n_fft/ctx_init.c | 2 +- src/n_fft/idft.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 0e541c645b..43f7c6b731 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -145,7 +145,7 @@ void n_fft_ctx_fit_depth(n_fft_ctx_t F, ulong depth) pr_quo = F->tab_w2[2*d+1]; pr_rem = n_mulmod_precomp_shoup_rem_from_quo(pr_quo, F->mod); // for each k, tab_w[2*(k+llen)] <- ww * tab_w[2*k], and deduce precomputation - for (ulong k = 0; k+3 < llen; k+=4) + for (ulong k = 0; k < llen; k+=4) { n_mulmod_and_precomp_shoup(F->tab_w + 2*llen + 2*(k+0), F->tab_w + 2*llen + 2*(k+0)+1, ww, F->tab_w[2*(k+0)], diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index ac0539b8dd..49ceea8a5f 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -265,10 +265,16 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) // input p0,p1,p2,p3 in ?? // output p0,p1,p2,p3 in ?? 
ulong p_hi, p_lo; - for (ulong k = 0; k < len/4; k++) + for (ulong k = 0; k < len/4; k+=4) { IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], + F->mod, F->mod2, p_hi, p_lo); } } } From 9ef50f1bbf24047ca4c818592f113d5e0701d5d8 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 09:45:36 +0100 Subject: [PATCH 65/71] a bit lazier --- src/n_fft/idft.c | 65 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 49ceea8a5f..e2e33ea3e3 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -41,7 +41,7 @@ do { \ // move in macros? // in [0..4n) x [0..2n) -> out [0..4n) x [0..4n) // TODO rename -#define BUTTERFLY_LAZY22(a, b, n2, tmp) \ +#define BUTTERFLY_LAZY4244(a, b, n2, tmp) \ do { \ tmp = (a); \ if (tmp >= (n2)) \ @@ -108,31 +108,58 @@ do { \ p_hi, p_lo); /* < 2*n */ \ } while(0) +#define IDFT4_LAZY12(a, b, c, d, \ + w1, w1_pr, w2, w2_pr, w3, w3_pr, \ + n, n2, p_hi, p_lo) \ +do { \ + const ulong v0 = (a); \ + const ulong v1 = (b); \ + const ulong v2 = (c); \ + const ulong v3 = (d); \ + ulong v4 = v0 + v1; /* < 2*n */ \ + ulong v5; \ + N_MULMOD_PRECOMP_LAZY(v5, (w2), v0 + (n) - v1, (w2_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + ulong v6 = v2 + v3; /* < 2*n */ \ + ulong v7; \ + N_MULMOD_PRECOMP_LAZY(v7, (w3), v2 + (n) - v3, (w3_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ + \ + (a) = v4 + v6; /* < 4*n */ \ + if ((a) >= (n2)) \ + (a) -= (n2); /* < 2*n */ \ + (b) = v5 + v7; /* < 4*n */ \ + if ((b) >= (n2)) \ + (b) -= (n2); /* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((c), (w1), v4 + (n2) - v6, (w1_pr), (n), \ + p_hi, p_lo); 
/* < 2*n */ \ + N_MULMOD_PRECOMP_LAZY((d), (w1), v5 + (n2) - v7, (w1_pr), (n), \ + p_hi, p_lo); /* < 2*n */ \ +} while(0) + /*--------------*/ /* 8-point IDFT */ /*--------------*/ // TODO doc -// TODO see how to make lazier #define IDFT8_NODE0_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ mod, mod2, tab_w) \ do { \ ulong p_hi, p_lo, tmp; \ \ - IDFT4_NODE0_LAZY12(p0, p1, p2, p3, \ + IDFT4_NODE0_LAZY14(p0, p1, p2, p3, \ tab_w[2], tab_w[3], \ mod, mod2, p_hi, p_lo); \ - /* TODO try a lazy12 variant of the next macro, */ \ - IDFT4_LAZY22(p4, p5, p6, p7, \ + IDFT4_LAZY12(p4, p5, p6, p7, \ tab_w[2], tab_w[3], \ tab_w[4], tab_w[5], \ tab_w[6], tab_w[7], \ mod, mod2, p_hi, p_lo); \ \ - BUTTERFLY_LAZY22(p0, p4, mod2, tmp); \ - BUTTERFLY_LAZY22(p1, p5, mod2, tmp); \ - BUTTERFLY_LAZY22(p2, p6, mod2, tmp); \ - BUTTERFLY_LAZY22(p3, p7, mod2, tmp); \ + BUTTERFLY_LAZY4244(p0, p4, mod2, tmp); \ + BUTTERFLY_LAZY4244(p1, p5, mod2, tmp); \ + BUTTERFLY_LAZY4244(p2, p6, mod2, tmp); \ + BUTTERFLY_LAZY4244(p3, p7, mod2, tmp); \ } while(0) @@ -262,19 +289,19 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) idft_lazy22(p3, depth-2, 3, F); // 4-point butterflies - // input p0,p1,p2,p3 in ?? - // output p0,p1,p2,p3 in ?? 
+ // input p0,p1,p2,p3 in 4 2 2 2 + // output p0,p1,p2,p3 in 4 4 4 4 ulong p_hi, p_lo; for (ulong k = 0; k < len/4; k+=4) { - IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); - IDFT4_NODE0_LAZY4222(p0[k], p1[k], p2[k], p3[k], F->tab_w[2], F->tab_w[3], - F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k+0], p1[k+0], p2[k+0], p3[k+0], + F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k+1], p1[k+1], p2[k+1], p3[k+1], + F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k+2], p1[k+2], p2[k+2], p3[k+2], + F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); + IDFT4_NODE0_LAZY4222(p0[k+3], p1[k+3], p2[k+3], p3[k+3], + F->tab_w[2], F->tab_w[3], F->mod, F->mod2, p_hi, p_lo); } } } From 3b5669da26720e4f12b8f548b433cba4077229d1 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 09:52:42 +0100 Subject: [PATCH 66/71] idft becoming good, remains some fine tuning to do --- src/n_fft/idft.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index e2e33ea3e3..e154d52245 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -211,6 +211,8 @@ do { \ /*--------------*/ +// TODO doc +// TODO add lazy12? 
void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1) @@ -235,19 +237,31 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) else { const ulong len = UWORD(1) << depth; - idft_lazy22(p, depth-1, 2*node, F); - idft_lazy22(p+len/2, depth-1, 2*node+1, F); - const ulong w = F->tab_w[2*node]; - const ulong w_pr = F->tab_w[2*node+1]; - ulong p_hi, p_lo, tmp; + // 4 recursive calls with depth-2 + const nn_ptr p0 = p; + const nn_ptr p1 = p + len/4; + const nn_ptr p2 = p + 2*len/4; + const nn_ptr p3 = p + 3*len/4; + idft_lazy22(p0, depth-2, 4*node, F); + idft_lazy22(p1, depth-2, 4*node+1, F); + idft_lazy22(p2, depth-2, 4*node+2, F); + idft_lazy22(p3, depth-2, 4*node+3, F); + + const ulong w2 = F->tab_w[2*node]; + const ulong w2_pr = F->tab_w[2*node+1]; + const ulong w = F->tab_w[4*node]; + const ulong w_pr = F->tab_w[4*node+1]; + const ulong Iw = F->tab_w[4*node+2]; + const ulong Iw_pr = F->tab_w[4*node+3]; + ulong p_hi, p_lo; - for (ulong k = 0; k < len/2; k+=4) + for (ulong k = 0; k < len/4; k+=4) { - IDFT2_LAZY22(p[k+0], p[len/2 + k+0], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); - IDFT2_LAZY22(p[k+1], p[len/2 + k+1], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); - IDFT2_LAZY22(p[k+2], p[len/2 + k+2], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); - IDFT2_LAZY22(p[k+3], p[len/2 + k+3], F->mod, F->mod2, w, w_pr, p_hi, p_lo, tmp); + IDFT4_LAZY22(p0[k+0], p1[k+0], p2[k+0], p3[k+0], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2, p_hi, p_lo); + IDFT4_LAZY22(p0[k+1], p1[k+1], p2[k+1], p3[k+1], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2, p_hi, p_lo); + IDFT4_LAZY22(p0[k+2], p1[k+2], p2[k+2], p3[k+2], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2, p_hi, p_lo); + IDFT4_LAZY22(p0[k+3], p1[k+3], p2[k+3], p3[k+3], w2, w2_pr, w, w_pr, Iw, Iw_pr, F->mod, F->mod2, p_hi, p_lo); } } } From 614dacb95287791391c75a59897f4661d64cd185 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 10:26:10 +0100 Subject: [PATCH 67/71] a 
bit lazier --- src/n_fft/idft.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index e154d52245..1f9788bc87 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -167,7 +167,7 @@ do { \ * TODO clean, check laziness * * in [0..?n) / out [0..?n) / max < ?n */ -#define DFT8_LAZY44(p0, p1, p2, p3, p4, p5, p6, p7, \ +#define DFT8_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ node, mod, mod2, tab_w) \ do { \ ulong p_hi, p_lo, tmp; \ @@ -175,13 +175,13 @@ do { \ const ulong w = tab_w[2*(node)]; \ const ulong w_pr = tab_w[2*(node)+1]; \ \ - IDFT4_LAZY22(p0, p1, p2, p3, \ + IDFT4_LAZY12(p0, p1, p2, p3, \ tab_w[4*(node)], tab_w[4*(node)+1], \ tab_w[8*(node)], tab_w[8*(node)+1], \ tab_w[8*(node)+2], tab_w[8*(node)+3], \ mod, mod2, p_hi, p_lo); \ \ - IDFT4_LAZY22(p4, p5, p6, p7, \ + IDFT4_LAZY12(p4, p5, p6, p7, \ tab_w[4*(node)+2], tab_w[4*(node)+3], \ tab_w[8*(node)+4], tab_w[8*(node)+5], \ tab_w[8*(node)+6], tab_w[8*(node)+7], \ @@ -213,7 +213,7 @@ do { \ // TODO doc // TODO add lazy12? 
-void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) +void idft_lazy12(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1) { @@ -223,7 +223,7 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) else if (depth == 2) { ulong p_hi, p_lo; - IDFT4_LAZY22(p[0], p[1], p[2], p[3], + IDFT4_LAZY12(p[0], p[1], p[2], p[3], F->tab_w[2*node], F->tab_w[2*node+1], F->tab_w[4*node], F->tab_w[4*node+1], F->tab_w[4*node+2], F->tab_w[4*node+3], @@ -231,7 +231,7 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) } else if (depth == 3) { - DFT8_LAZY44(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + DFT8_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); } else @@ -243,10 +243,10 @@ void idft_lazy22(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) const nn_ptr p1 = p + len/4; const nn_ptr p2 = p + 2*len/4; const nn_ptr p3 = p + 3*len/4; - idft_lazy22(p0, depth-2, 4*node, F); - idft_lazy22(p1, depth-2, 4*node+1, F); - idft_lazy22(p2, depth-2, 4*node+2, F); - idft_lazy22(p3, depth-2, 4*node+3, F); + idft_lazy12(p0, depth-2, 4*node, F); + idft_lazy12(p1, depth-2, 4*node+1, F); + idft_lazy12(p2, depth-2, 4*node+2, F); + idft_lazy12(p3, depth-2, 4*node+3, F); const ulong w2 = F->tab_w[2*node]; const ulong w2_pr = F->tab_w[2*node+1]; @@ -298,13 +298,13 @@ void idft_node0_lazy12(nn_ptr p, ulong depth, n_fft_args_t F) const nn_ptr p2 = p + 2*len/4; const nn_ptr p3 = p + 3*len/4; idft_node0_lazy12(p0, depth-2, F); - idft_lazy22(p1, depth-2, 1, F); - idft_lazy22(p2, depth-2, 2, F); - idft_lazy22(p3, depth-2, 3, F); + idft_lazy12(p1, depth-2, 1, F); + idft_lazy12(p2, depth-2, 2, F); + idft_lazy12(p3, depth-2, 3, F); // 4-point butterflies - // input p0,p1,p2,p3 in 4 2 2 2 - // output p0,p1,p2,p3 in 4 4 4 4 + // input p0 in [0,4n), p1,p2,p3 in [0,2n) + // output p0,p1,p2,p3 in [0,4n) ulong p_hi, p_lo; for (ulong k = 0; k < len/4; k+=4) { From 9567b159f24f4d194edc7c9e3de6b5bbe4793059 Mon 
Sep 17 00:00:00 2001 From: Vincent Neiger Date: Mon, 28 Oct 2024 10:26:35 +0100 Subject: [PATCH 68/71] fix name --- src/n_fft/idft.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 1f9788bc87..ad36e206a3 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -167,7 +167,7 @@ do { \ * TODO clean, check laziness * * in [0..?n) / out [0..?n) / max < ?n */ -#define DFT8_LAZY14(p0, p1, p2, p3, p4, p5, p6, p7, \ +#define DFT8_LAZY12(p0, p1, p2, p3, p4, p5, p6, p7, \ node, mod, mod2, tab_w) \ do { \ ulong p_hi, p_lo, tmp; \ @@ -231,7 +231,7 @@ void idft_lazy12(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) } else if (depth == 3) { - DFT8_LAZY14(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], + DFT8_LAZY12(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], node, F->mod, F->mod2, F->tab_w); } else From c438c43207ab29a022f994aa4cabbabdbea7406e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sat, 9 Nov 2024 11:03:36 +0100 Subject: [PATCH 69/71] minor comment --- src/n_fft/idft.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index ad36e206a3..4249d47b50 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -28,6 +28,7 @@ * by 1/2, since the inverse of [1 iw] is 1/2 * [1 1] * [1 -iw] [w -w] */ +// TODO make order of arguments consistent #define IDFT2_LAZY22(a, b, n, n2, w, w_pr, p_hi, p_lo, tmp) \ do { \ tmp = (a) + (n2) - (b); /* [0..4n) */ \ @@ -212,7 +213,6 @@ do { \ // TODO doc -// TODO add lazy12? 
void idft_lazy12(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1) From 4bb083a9f425fb2572d34eb32e10c86d210a263e Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sat, 9 Nov 2024 15:38:14 +0100 Subject: [PATCH 70/71] remove unnecessary includes --- src/n_fft.h | 3 --- src/n_fft/ctx_init.c | 2 -- src/n_fft/dft.c | 1 - src/n_fft/idft.c | 1 - 4 files changed, 7 deletions(-) diff --git a/src/n_fft.h b/src/n_fft.h index e3b8458669..6676fe9fd2 100644 --- a/src/n_fft.h +++ b/src/n_fft.h @@ -12,9 +12,6 @@ #ifndef N_FFT_H #define N_FFT_H -#include "flint.h" -#include "nmod.h" -#include "nmod_vec.h" #include "ulong_extras.h" #define N_FFT_CTX_DEFAULT_DEPTH 12 diff --git a/src/n_fft/ctx_init.c b/src/n_fft/ctx_init.c index 43f7c6b731..d05a5f9463 100644 --- a/src/n_fft/ctx_init.c +++ b/src/n_fft/ctx_init.c @@ -9,8 +9,6 @@ (at your option) any later version. See <https://www.gnu.org/licenses/>. */ -#include "flint.h" -#include "ulong_extras.h" #include "n_fft.h" /** Given the precomputed quotient a_pr for modular multiplication by a mod n, diff --git a/src/n_fft/dft.c b/src/n_fft/dft.c index f2e6b72987..97205418e4 100644 --- a/src/n_fft/dft.c +++ b/src/n_fft/dft.c @@ -9,7 +9,6 @@ (at your option) any later version. See <https://www.gnu.org/licenses/>. */ -#include "longlong.h" #include "n_fft.h" #include "n_fft_macros.h" diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 4249d47b50..875f1caf22 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -9,7 +9,6 @@ (at your option) any later version. See <https://www.gnu.org/licenses/>.
*/ -#include "longlong.h" #include "n_fft.h" #include "n_fft_macros.h" From 99284b598fe607430c738e134238386c34bd0af8 Mon Sep 17 00:00:00 2001 From: Vincent Neiger Date: Sat, 9 Nov 2024 19:20:33 +0100 Subject: [PATCH 71/71] add todo about tests idft any node small depths --- src/n_fft/idft.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/n_fft/idft.c b/src/n_fft/idft.c index 875f1caf22..50036b3c3d 100644 --- a/src/n_fft/idft.c +++ b/src/n_fft/idft.c @@ -212,6 +212,7 @@ do { \ // TODO doc +// TODO make sure this is tested (code coverage: including for small depths) void idft_lazy12(nn_ptr p, ulong depth, ulong node, n_fft_args_t F) { if (depth == 1)