diff --git a/.bazeliskrc b/.bazeliskrc
new file mode 100644
index 000000000..f4c188428
--- /dev/null
+++ b/.bazeliskrc
@@ -0,0 +1 @@
+USE_BAZEL_VERSION=7.4.1
\ No newline at end of file
diff --git a/.bazelrc b/.bazelrc
index 7ba093994..6792b5cd2 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -15,6 +15,10 @@
 common --experimental_repo_remote_exec
 common --experimental_cc_shared_library
 
+common --registry=https://raw.githubusercontent.com/secretflow/bazel-registry/main
+common --registry=https://bcr.bazel.build
+common --registry=https://baidu.github.io/babylon/registry
+
 # Required by OpenXLA
 build --nocheck_visibility
 
@@ -64,3 +68,6 @@ build:macos --action_env MACOSX_DEPLOYMENT_TARGET=13.0
 
 build:linux --copt=-fopenmp
 build:linux --linkopt=-fopenmp
+
+# NOTE: fixes the build on linux_aarch64. The flag is set under `build:linux` because `build:linux_aarch64` only takes effect when `--config=linux_aarch64` is passed.
+build:linux --cxxopt=-Wno-mismatched-new-delete
diff --git a/.bazelversion b/.bazelversion
deleted file mode 100644
index f22d756da..000000000
--- a/.bazelversion
+++ /dev/null
@@ -1 +0,0 @@
-6.5.0
diff --git a/.circleci/asan-config.yml b/.circleci/asan-config.yml
index 901cad5de..fb1237402 100644
--- a/.circleci/asan-config.yml
+++ b/.circleci/asan-config.yml
@@ -46,10 +46,6 @@ jobs:
       - run:
           name: Checkout devtools
           command: git clone https://github.com/secretflow/devtools.git ../devtools
-      - run:
-          name: "Install dependencies"
-          command: |
-            python3 -m pip install numpy
       - run:
           name: "test"
           command: |
diff --git a/.circleci/benchmark-config.yml b/.circleci/benchmark-config.yml
index 8a4ddad37..82eab8c37 100644
--- a/.circleci/benchmark-config.yml
+++ b/.circleci/benchmark-config.yml
@@ -47,8 +47,6 @@ jobs:
                 "python3 -m pip install -U pip && \
                    python3 -m pip install spu && \
                    cd /home/admin/dev/ && \
-                   python3 -m pip install -r requirements-dev.txt && \
-                   python3 -m pip install -r examples/python/ml/requirements.txt && \
                    export PYTHONPATH="${PWD}:$PYTHONPATH" && \
                    bash .circleci/run-nn.sh" | tee benchmark_results.log
       - run:
diff --git a/.circleci/continue-config.yml b/.circleci/continue-config.yml
index 50ff328c9..5b0669ff8 100644
--- a/.circleci/continue-config.yml
+++ b/.circleci/continue-config.yml
@@ -52,11 +52,6 @@ commands:
       - run:
           name: Checkout devtools
           command: git clone https://github.com/secretflow/devtools.git ../devtools
-      - run:
-          name: "Install dependencies"
-          command: |
-            python3 -m pip install -r requirements.txt
-            python3 -m pip install -r requirements-dev.txt
       - run:
           name: Setup GCS
           command: |
@@ -64,7 +59,10 @@ commands:
             ../devtools/bazel_cache_setup.py --in_file=../gcs.data --out_file=../gcs.json --min_download
       - run:
           name: "build"
-          command: bazel build <<parameters.targets>> -c opt --ui_event_filters=-info,-debug,-warning
+          command: |
+            bazel --version && python3 --version
+            if [ ! -e "/usr/bin/python3" ]; then ln -s `which python3` /usr/bin/python3; fi
+            bazel run //:requirements-dev.update && bazel build <<parameters.targets>> -c opt --ui_event_filters=-info,-debug,-warning
       - run:
           name: "test"
           command: |
@@ -114,7 +112,6 @@ jobs:
           name: Install extra deps
           command: |
             python3 -m pip install tzdata
-            python3 -m pip install -r examples/python/ml/requirements.txt
       - build_and_test:
           targets: //examples/python/ml:ml_test
           extra_bazel_test_args: --test_env LD_LIBRARY_PATH=/root/miniconda3/lib/
@@ -134,8 +131,6 @@ jobs:
             bash ~/miniconda.sh -b -p $HOME/miniconda
             source $HOME/miniconda/bin/activate
             conda init bash zsh
-            pip install -r requirements.txt
-            pip install -r requirements-dev.txt
       - build_and_test
 # Invoke jobs via workflows
 # See: https://circleci.com/docs/2.0/configuration-reference/#workflows
diff --git a/.circleci/release-config.yml b/.circleci/release-config.yml
index 762fa5999..fdc689b6a 100644
--- a/.circleci/release-config.yml
+++ b/.circleci/release-config.yml
@@ -52,11 +52,11 @@ commands:
             conda create -n build python=<< parameters.python_ver >> -y
             conda activate build
 
-            sh ./build_wheel_entrypoint.sh
+            bazel build //:spu_wheel -c opt --@rules_python//python/config_settings:python_version=<< parameters.python_ver >>
             python3 -m pip install twine
-            ls dist/*.whl
+            ls bazel-bin/spu*.whl
 
-            python3 -m twine upload -r pypi -u __token__ -p ${PYPI_TWINE_TOKEN} dist/*.whl
+            python3 -m twine upload -r pypi -u __token__ -p ${PYPI_TWINE_TOKEN} bazel-bin/spu*.whl
 
 # Define a job to be invoked later in a workflow.
 # See: https://circleci.com/docs/2.0/configuration-reference/#jobs
@@ -103,7 +103,7 @@ workflows:
       - linux_publish:
           matrix:
             parameters:
-              python_ver: ["3.9", "3.10", "3.11"]
+              python_ver: ["3.10", "3.11"]
               executor: ["linux_x64_executor", "linux_aarch64_executor"]
           filters:
             tags:
@@ -111,7 +111,7 @@ workflows:
       - macOS_publish:
           matrix:
             parameters:
-              python_ver: ["3.9", "3.10", "3.11"]
+              python_ver: ["3.10", "3.11"]
           filters:
             tags:
               only: /.*(?<!dev\d{8})$/
diff --git a/.gitignore b/.gitignore
index 2f2f2bbbf..55c0e8544 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ _build
 
 # bazel
 /bazel-*
+MODULE.bazel.lock
 
 # cmake related
 abseil-cpp
diff --git a/.licenserc.yaml b/.licenserc.yaml
index 92b3ce591..746e2429c 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -43,14 +43,14 @@ header: # <1>
     - 'libspu/compiler/tests/interpret/template/**.template'
     - 'LICENSE'
     - 'NOTICE'
-    - '.bazelversion'
+    - 'MODULE.bazel.lock'
+    - '.bazeliskrc'
     - '.clang-format'
     - '.clang-tidy'
     - '.gitattributes'
     - '.gitignore'
     - '.gitmodules'
     - 'pyproject.toml'
-    - 'setup.cfg'
     - 'libspu/core/half.h' # MIT
 
   comment: never # <9>
diff --git a/BUILD.bazel b/BUILD.bazel
index 7a517cec5..6a5b98642 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -11,3 +11,114 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+load("@python_versions//3.10:defs.bzl", compile_pip_requirements_3_10 = "compile_pip_requirements")
+load("@python_versions//3.11:defs.bzl", compile_pip_requirements_3_11 = "compile_pip_requirements")
+
+# load("@python_versions//3.9:defs.bzl", compile_pip_requirements_3_9 = "compile_pip_requirements")
+load("@rules_python//python:packaging.bzl", "py_package", "py_wheel")
+load("@rules_python//python:pip.bzl", "compile_pip_requirements")
+load("//:version.bzl", "SPU_VERSION")
+
+# compile_pip_requirements_3_9(
+#     name = "requirements_3_9",
+#     src = "requirements_3_9.txt",
+#     requirements_txt = "requirements_lock_3_9.txt",
+#     tags = ["manual"],
+# )
+
+compile_pip_requirements_3_10(
+    name = "requirements_3_10",
+    src = "requirements.txt",
+    requirements_txt = "requirements_lock_3_10.txt",
+    tags = ["manual"],
+)
+
+compile_pip_requirements_3_11(
+    name = "requirements_3_11",
+    src = "requirements.txt",
+    requirements_txt = "requirements_lock_3_11.txt",
+    tags = ["manual"],
+)
+
+compile_pip_requirements(
+    name = "requirements-dev",
+    src = "requirements-dev.txt",
+    requirements_txt = "requirements_dev_lock.txt",
+    tags = ["manual"],
+)
+
+exports_files([
+    "README.md",
+])
+
+# https://rules-python.readthedocs.io/en/latest/api/rules_python/python/packaging.html#py_wheel_rule
+py_wheel(
+    name = "spu_wheel",
+    abi = select(
+        {
+            "@rules_python//python/config_settings:is_python_3.10": "cp310",
+            "@rules_python//python/config_settings:is_python_3.11": "cp311",
+            # "@rules_python//python/config_settings:is_python_3.9": "cp39",
+            "//conditions:default": "none",
+        },
+    ),
+    author = "SecretFlow Team",
+    author_email = "secretflow-contact@service.alipay.com",
+    classifiers = [
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+    ],
+    description_content_type = "text/markdown",
+    description_file = "README.md",
+    distribution = "spu",
+    extra_distinfo_files = {
+        "//:LICENSE": "LICENSE",
+    },
+    homepage = "https://github.com/secretflow/spu",
+    license = "Apache License 2.0",
+    # TODO: add other fields.
+    platform = select(
+        {
+            "@bazel_tools//src/conditions:linux_x86_64": "manylinux2014_x86_64",
+            "@bazel_tools//src/conditions:darwin_arm64": "macosx_13_0_arm64",
+            "@bazel_tools//src/conditions:linux_aarch64": "manylinux_2_28_aarch64",
+            "//conditions:default": "any",
+        },
+    ),
+    # Keep classifiers and python_requires in sync with SUPPORTED_PYTHON_VERSIONS in MODULE.bazel (3.9 is disabled).
+    python_requires = ">=3.10",
+    python_tag = select(
+        {
+            "@rules_python//python/config_settings:is_python_3.10": "cp310",
+            "@rules_python//python/config_settings:is_python_3.11": "cp311",
+            # "@rules_python//python/config_settings:is_python_3.9": "cp39",
+            "//conditions:default": "py3",
+        },
+    ),
+    requires_file = "requirements.txt",
+    summary = "SPU aims to be a 'provable', 'measurable' secure computation device.",
+    tags = ["manual"],
+    twine = None,
+    version = SPU_VERSION,
+    deps = [
+        ":spu_pkg",
+    ],
+)
+
+py_package(
+    name = "spu_pkg",
+    packages = [
+        "libspu",
+        "spu",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        "//spu:api",
+        "//spu:init",
+        "//spu:libpsi",
+        "//spu:libspu",
+        "//spu/ops/groupby",
+        "//spu/utils:distributed",
+    ],
+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97ff36b09..428fda2f0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,12 @@
 >
 > please add your unreleased change here.
 
+- [SPU] Migrate to Bazel modules (Bzlmod) and raise the minimum Python version to 3.10
+- [Feature] Add soPRF (LowMC cipher) for SEMI2K
+- [API] Add Permute/InvPermute support in HLO
+- [Feature] Add SSL configuration to the TTP server
+- [Feature] Support quick sort for semi2k and aby3
+
 ## 20241219
 
 - [SPU] 0.9.3b0 release
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9701c407b..ef2743e3d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,13 +62,6 @@ docker exec -it spu-dev-$(whoami) bash
 Install gcc>=11.2, cmake>=3.26, ninja, nasm>=2.15, python>=3.9, bazelisk, xxd, lld
 ```
 
-About the commands used to install the above dependencies, you can follow [Ubuntu docker file](https://github.com/secretflow/devtools/blob/main/dockerfiles/ubuntu-base-ci.DockerFile).
-
-```sh
-python3 -m pip install -r requirements.txt
-python3 -m pip install -r requirements-dev.txt
-```
-
 #### macOS
 
 ```sh
@@ -90,10 +83,6 @@ brew install bazelisk cmake ninja libomp wget
 
 # For Intel mac only
 brew install nasm
-
-# Install python dependencies
-pip install -r requirements.txt
-pip install -r requirements-dev.txt
 ```
 
 ### Build & UnitTest
@@ -117,6 +106,7 @@ bazel test //... --features=ubsan
 
 - `--define gperf=on` enable gperf
 - `--define tracelog=on` enable link trace log.
+- `--@rules_python//python/config_settings:python_version=3.10` build with Python 3.10 (the default version is 3.11)
 
 ### Build docs
 
diff --git a/INSTALLATION.md b/INSTALLATION.md
index 660baa8f1..de416aaa0 100644
--- a/INSTALLATION.md
+++ b/INSTALLATION.md
@@ -38,8 +38,8 @@ pip install spu
 - At the root of repo, run
 
 ```bash
-python setup.py bdist_wheel
-pip install dist/*.whl --force-reinstall
+bazel build //:spu_wheel -c opt
+pip install bazel-bin/spu-*.whl --force-reinstall
 ```
 
 - Once GCC/bazel/python/Xcode version or other environment settings have changed, please run the following command to ensure a clean build
@@ -51,5 +51,15 @@ bazel clean --expunge
 #### Build with GPU support
 
 ```bash
-export ENABLE_GPU_BUILD=1 && python setup.py bdist_wheel
+bazel build //:spu_wheel -c opt --config=gpu
+```
+
+#### Build with specified python version
+
+```bash
+# build with python 3.10
+bazel build //:spu_wheel -c opt --@rules_python//python/config_settings:python_version=3.10
+
+# build with python 3.11
+bazel build //:spu_wheel -c opt --@rules_python//python/config_settings:python_version=3.11
 ```
diff --git a/MODULE.bazel b/MODULE.bazel
new file mode 100644
index 000000000..2552234e8
--- /dev/null
+++ b/MODULE.bazel
@@ -0,0 +1,164 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###############################################################################
+# External dependencies of this project are managed with Bzlmod (Bazel modules).
+# The legacy WORKSPACE file has been removed; declare new dependencies in this file.
+#
+# For background on the WORKSPACE-to-Bzlmod migration, see https://github.com/bazelbuild/bazel/issues/18958
+###############################################################################
+
+module(
+    name = "spulib",
+    version = "0.9.4",
+    compatibility_level = 1,
+)
+
+bazel_dep(name = "grpc")
+single_version_override(
+    module_name = "grpc",
+    patch_strip = 1,
+    patches = [
+        "//bazel/patches:grpc-1.66.patch",
+        "//bazel/patches:grpc-module-file.patch",
+    ],
+    version = "1.66.0.bcr.3",
+)
+
+bazel_dep(name = "protobuf", version = "27.3")
+single_version_override(
+    module_name = "protobuf",
+    patch_strip = 1,
+    patches = [
+        "//bazel/patches:protobuf-xla.patch",
+    ],
+    version = "27.3",
+)
+
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "apple_support", version = "1.17.1")
+bazel_dep(name = "rules_cc", version = "0.0.12")
+bazel_dep(name = "rules_cuda", version = "0.2.3")
+bazel_dep(name = "rules_foreign_cc", version = "0.12.0")
+bazel_dep(name = "bazel_features", version = "1.20.0")
+bazel_dep(name = "platforms", version = "0.0.8")
+bazel_dep(name = "pybind11_bazel", version = "2.13.6")
+bazel_dep(name = "rules_python", version = "0.29.0")
+bazel_dep(name = "rules_proto", version = "6.0.0-rc1")
+bazel_dep(name = "spdlog", version = "1.14.1")
+bazel_dep(name = "fmt", version = "11.0.2")
+bazel_dep(name = "abseil-cpp", version = "20240722.0")
+
+DEFAULT_PYTHON_VERSION = "3.11"
+
+SUPPORTED_PYTHON_VERSIONS = [
+    # "3.9",
+    "3.10",
+    "3.11",
+]
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python")
+
+[
+    python.toolchain(
+        ignore_root_user_error = True,
+        is_default = python_version == DEFAULT_PYTHON_VERSION,
+        python_version = python_version,
+    )
+    for python_version in SUPPORTED_PYTHON_VERSIONS
+]
+
+use_repo(python, "python_versions")
+
+pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
+
+[
+    pip.parse(
+        hub_name = "spu_pip",
+        python_version = python_version,
+        requirements_lock = "//:requirements_lock_{}.txt".format(python_version.replace(".", "_")),
+    )
+    for python_version in SUPPORTED_PYTHON_VERSIONS
+]
+
+use_repo(pip, "spu_pip")
+pip.parse(
+    hub_name = "spu_pip_dev",
+    python_version = DEFAULT_PYTHON_VERSION,
+    requirements_lock = "//:requirements_dev_lock.txt",
+)
+pip.override(
+    file = "torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl",
+    patch_strip = 1,
+    patches = [
+        # FIXME: https://github.com/pytorch/pytorch/issues/117350
+        "//bazel/patches:pytorch.patch",
+        "//bazel/patches:pytorch_record.patch",
+    ],
+)
+use_repo(pip, "spu_pip_dev")
+
+# presumably resolved from the Babylon registry (--registry=https://baidu.github.io/babylon/registry in .bazelrc) - TODO confirm
+bazel_dep(name = "leveldb", version = "1.23")
+
+# self-hosted registry
+bazel_dep(name = "eigen", version = "3.4.90-20230801-66e8f3")
+bazel_dep(name = "emp-tool", version = "0.2.5")
+bazel_dep(name = "emp-ot", version = "0.2.4")
+bazel_dep(name = "brpc", version = "1.11.0-20241212-282bc90")
+bazel_dep(name = "seal", version = "4.1.1")
+bazel_dep(name = "cutlass", version = "3.5.1")
+bazel_dep(name = "llvm-raw", version = "20240809.0-35f55f5")
+bazel_dep(name = "sse2neon", version = "1.7.0-20240330-8df2f48")
+
+llvm = use_extension("@llvm-raw//utils/bazel:extension.bzl", "llvm")
+llvm.configure(
+    targets = [
+        "AArch64",
+        "X86",
+        "ARM",
+    ],
+)
+use_repo(llvm, "llvm-project")
+
+bazel_dep(name = "stablehlo", version = "20240808.0-24d1807")
+bazel_dep(name = "xla", version = "20240814.0-64bdcc5")
+bazel_dep(name = "yacl", version = "20241212.0-871832a")
+bazel_dep(name = "psi")
+git_override(
+    module_name = "psi",
+    commit = "8ead92f1bb10329c7e7e56d541fecb3dcd47ee03",
+    remote = "https://github.com/secretflow/psi.git",
+)
+
+spu_dependencies = use_extension("//bazel:defs.bzl", "non_module_dependencies")
+use_repo(spu_dependencies, "xtensor")
+
+new_local_repository = use_repo_rule("@bazel_tools//tools/build_defs/repo:local.bzl", "new_local_repository")
+
+new_local_repository(
+    name = "macos_omp_x64",
+    build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+    path = "/usr/local/opt/libomp",
+)
+
+new_local_repository(
+    name = "macos_omp_arm64",
+    build_file = "@yacl//bazel:local_openmp_macos.BUILD",
+    path = "/opt/homebrew/opt/libomp/",
+)
+
+# test
+bazel_dep(name = "googletest", version = "1.15.2", dev_dependency = True)
+bazel_dep(name = "google_benchmark", version = "1.8.5", dev_dependency = True)
diff --git a/WORKSPACE b/WORKSPACE
deleted file mode 100644
index 8a40fb588..000000000
--- a/WORKSPACE
+++ /dev/null
@@ -1,97 +0,0 @@
-# Copyright 2021 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-workspace(name = "spulib")
-
-load("//bazel:repositories.bzl", "spu_deps")
-
-spu_deps()
-
-#
-# yacl
-# Warning: SPU relies on yacl to bring in common 3p libraries.
-#          Please make sure yacl_deps are called right after spu_deps.
-#
-load("@yacl//bazel:repositories.bzl", "yacl_deps")
-
-yacl_deps()
-
-load("@psi//bazel:repositories.bzl", "psi_deps")
-
-psi_deps()
-
-load("@rules_python//python:repositories.bzl", "py_repositories")
-
-py_repositories()
-
-load("@pybind11_bazel//:python_configure.bzl", "python_configure")
-
-python_configure(
-    name = "local_config_python",
-    python_version = "3",
-)
-
-load(
-    "@rules_foreign_cc//foreign_cc:repositories.bzl",
-    "rules_foreign_cc_dependencies",
-)
-
-rules_foreign_cc_dependencies(
-    register_built_tools = False,
-    register_default_tools = False,
-    register_preinstalled_tools = True,
-)
-
-load("@bazel_features//:deps.bzl", "bazel_features_deps")
-
-bazel_features_deps()
-
-load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies")
-
-rules_cuda_dependencies()
-
-register_detected_cuda_toolchains()
-
-load("@xla//:workspace4.bzl", "xla_workspace4")
-
-xla_workspace4()
-
-load("@xla//:workspace3.bzl", "xla_workspace3")
-
-xla_workspace3()
-
-load("@xla//:workspace2.bzl", "xla_workspace2")
-
-xla_workspace2()
-
-load("@xla//:workspace1.bzl", "xla_workspace1")
-
-xla_workspace1()
-
-load("@xla//:workspace0.bzl", "xla_workspace0")
-
-xla_workspace0()
-
-load("@rules_proto_grpc//:repositories.bzl", "rules_proto_grpc_repos", "rules_proto_grpc_toolchains")
-
-rules_proto_grpc_toolchains()
-
-rules_proto_grpc_repos()
-
-#
-# boost
-#
-load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
-
-boost_deps()
diff --git a/bazel/local_openmp_macos.BUILD b/bazel/defs.bzl
similarity index 66%
rename from bazel/local_openmp_macos.BUILD
rename to bazel/defs.bzl
index 82d976b8b..d5037417e 100644
--- a/bazel/local_openmp_macos.BUILD
+++ b/bazel/defs.bzl
@@ -1,4 +1,4 @@
-# Copyright 2022 Ant Group Co., Ltd.
+# Copyright 2024 Ant Group Co., Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,16 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("//bazel:repositories.bzl", "spu_deps")
 
-cc_library(
-    name = "openmp",
-    srcs = [
-        "lib/libomp.a",
-    ],
-    hdrs = ["include/omp.h"],
-    includes = [
-        "include/",
-    ],
-    visibility = ["//visibility:public"],
+def _non_module_dependencies_impl(_ctx):
+    spu_deps()
+
+non_module_dependencies = module_extension(
+    implementation = _non_module_dependencies_impl,
 )
diff --git a/bazel/eigen.BUILD b/bazel/eigen.BUILD
deleted file mode 100644
index 80ccf3ca6..000000000
--- a/bazel/eigen.BUILD
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# copy from tf:
-# https://raw.githubusercontent.com/tensorflow/tensorflow/master/third_party/eigen.BUILD
-#
-# Description:
-#   Eigen is a C++ template library for linear algebra: vectors,
-#   matrices, and related algorithms.
-
-load("@rules_cc//cc:defs.bzl", "cc_library")
-load("@yacl//bazel:yacl.bzl", "OMP_DEPS")
-
-licenses([
-    # Note: Eigen is an MPL2 library that includes GPL v3 and LGPL v2.1+ code.
-    #       We've taken special care to not reference any restricted code.
-    "reciprocal",  # MPL2
-    "notice",  # Portions BSD
-])
-
-exports_files(["COPYING.MPL2"])
-
-EIGEN_FILES = [
-    "Eigen/**",
-    "unsupported/Eigen/CXX11/**",
-    "unsupported/Eigen/FFT",
-    "unsupported/Eigen/KroneckerProduct",
-    "unsupported/Eigen/src/FFT/**",
-    "unsupported/Eigen/src/KroneckerProduct/**",
-    "unsupported/Eigen/MatrixFunctions",
-    "unsupported/Eigen/SpecialFunctions",
-    "unsupported/Eigen/src/MatrixFunctions/**",
-    "unsupported/Eigen/src/SpecialFunctions/**",
-]
-
-# Files known to be under MPL2 license.
-EIGEN_MPL2_HEADER_FILES = glob(
-    EIGEN_FILES,
-    exclude = [
-        # Guarantees that any non-MPL2 file added to the list above will fail to
-        # compile.
-        "Eigen/src/Core/util/NonMPL2.h",
-        "Eigen/**/CMakeLists.txt",
-    ],
-)
-
-cc_library(
-    name = "eigen3",
-    hdrs = EIGEN_MPL2_HEADER_FILES,
-    defines = [
-        # This define (mostly) guarantees we don't link any problematic
-        # code. We use it, but we do not rely on it, as evidenced above.
-        "EIGEN_MPL2_ONLY",
-    ],
-    includes = ["."],
-    visibility = ["//visibility:public"],
-    deps = OMP_DEPS,
-)
-
-filegroup(
-    name = "eigen_header_files",
-    srcs = EIGEN_MPL2_HEADER_FILES,
-    visibility = ["//visibility:public"],
-)
diff --git a/bazel/emp-ot.BUILD b/bazel/emp-ot.BUILD
deleted file mode 100644
index 4e3814729..000000000
--- a/bazel/emp-ot.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all_srcs",
-    srcs = glob(["**"]),
-)
-
-spu_cmake_external(
-    name = "emp-ot",
-    cache_entries = {
-        "CMAKE_FOLDER": "$EXT_BUILD_DEPS/emp-tool",
-        "EMP-TOOL_INCLUDE_DIR": "$EXT_BUILD_DEPS/emp-tool/include",
-        "EMP-TOOL_LIBRARY": "$EXT_BUILD_DEPS/emp-tool/lib",
-        "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
-        "BUILD_TESTING": "OFF",
-    },
-    lib_source = ":all_srcs",
-    out_headers_only = True,
-    deps = [
-        "@com_github_emptoolkit_emp_tool//:emp-tool",
-        "@com_github_openssl_openssl//:openssl",
-    ],
-)
diff --git a/bazel/emp-tool.BUILD b/bazel/emp-tool.BUILD
deleted file mode 100644
index 034df99dc..000000000
--- a/bazel/emp-tool.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all_srcs",
-    srcs = glob(["**"]),
-)
-
-spu_cmake_external(
-    name = "emp-tool",
-    cache_entries = {
-        "OPENSSL_ROOT_DIR": "$EXT_BUILD_DEPS/openssl",
-        "BUILD_TESTING": "OFF",
-    },
-    lib_source = ":all_srcs",
-    out_data_dirs = ["cmake"],
-    out_static_libs = [
-        "libemp-tool.a",
-    ],
-    deps = [
-        "@com_github_openssl_openssl//:openssl",
-    ],
-)
diff --git a/bazel/hexl.BUILD b/bazel/hexl.BUILD
deleted file mode 100644
index 40adfc30f..000000000
--- a/bazel/hexl.BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all_srcs",
-    srcs = glob(["**"]),
-)
-
-spu_cmake_external(
-    name = "hexl",
-    cache_entries = {
-        "CMAKE_BUILD_TYPE": "Release",
-        "CpuFeatures_DIR": "$EXT_BUILD_DEPS/cpu_features/lib/cmake/CpuFeatures/",
-        "HEXL_BENCHMARK": "OFF",
-        "HEXL_TESTING": "OFF",
-        "CMAKE_INSTALL_LIBDIR": "lib",
-    },
-    lib_source = ":all_srcs",
-    out_data_dirs = ["lib/cmake"],
-    out_static_libs = ["libhexl.a"],
-    deps = [
-        "@com_github_google_cpu_features//:cpu_features",
-    ],
-)
diff --git a/bazel/nvidia_cutlass.BUILD b/bazel/patches/BUILD.bazel
similarity index 55%
rename from bazel/nvidia_cutlass.BUILD
rename to bazel/patches/BUILD.bazel
index dab20b76d..8289c83ed 100644
--- a/bazel/nvidia_cutlass.BUILD
+++ b/bazel/patches/BUILD.bazel
@@ -4,30 +4,10 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cc_library")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all",
-    srcs = glob(["**"]),
-)
-
-spu_cc_library(
-    name = "cutlass",
-    srcs = [],
-    hdrs = glob([
-        "include/**/*.h",
-        "include/**/*.hpp",
-    ]),
-    strip_include_prefix = "include",
-    visibility = ["//visibility:public"],
-)
diff --git a/bazel/patches/emp-ot.patch b/bazel/patches/emp-ot.patch
deleted file mode 100644
index 31cb353a4..000000000
--- a/bazel/patches/emp-ot.patch
+++ /dev/null
@@ -1,99 +0,0 @@
-diff --git a/emp-ot/ferret/ferret_cot.hpp b/emp-ot/ferret/ferret_cot.hpp
-index 9dc8222..fbd6170 100644
---- a/emp-ot/ferret/ferret_cot.hpp
-+++ b/emp-ot/ferret/ferret_cot.hpp
-@@ -28,8 +28,8 @@ FerretCOT<T>::FerretCOT(int party, int threads, T **ios,
- template<typename T>
- FerretCOT<T>::~FerretCOT() {
- 	if (ot_pre_data != nullptr) {
--		if(party == ALICE) write_pre_data128_to_file((void*)ot_pre_data, (__uint128_t)Delta, pre_ot_filename);
--		else write_pre_data128_to_file((void*)ot_pre_data, (__uint128_t)0, pre_ot_filename);
-+		// if(party == ALICE) write_pre_data128_to_file((void*)ot_pre_data, (__uint128_t)Delta, pre_ot_filename);
-+		// else write_pre_data128_to_file((void*)ot_pre_data, (__uint128_t)0, pre_ot_filename);
- 		delete[] ot_pre_data;
- 	}
- 	if (ot_data != nullptr) delete[] ot_data;
-@@ -100,7 +100,9 @@ void FerretCOT<T>::setup(std::string pre_file) {
- 	});
- 
- 	ot_pre_data = new block[param.n_pre];
--	bool hasfile = file_exists(pre_ot_filename), hasfile2;
-+	//bool hasfile = file_exists(pre_ot_filename), hasfile2;
-+	bool hasfile = false; 
-+	bool hasfile2 = false;
- 	if(party == ALICE) {
- 		io->send_data(&hasfile, sizeof(bool));
- 		io->flush();
-
-diff --git a/emp-ot/ferret/mpcot_reg.h b/emp-ot/ferret/mpcot_reg.h
-index 6659aa7..6b01601 100644
---- a/emp-ot/ferret/mpcot_reg.h
-+++ b/emp-ot/ferret/mpcot_reg.h
-@@ -123,6 +123,10 @@ public:
- 		for(int i = start; i < end; ++i)
- 			exec_f2k_sender(senders[i], ot, sparse_vector+i*leave_n, 
- 					ios[threads - 1], i);
-+
-+        for (int i = 0; i < threads; i++)
-+            ios[i]->flush();
-+
- 		for (auto & f : fut) f.get();
- 	}
- 
-@@ -152,7 +156,7 @@ public:
- 			block *ggm_tree_mem, IO *io, int i) {
- 		sender->compute(ggm_tree_mem, Delta_f2k);
- 		sender->template send_f2k<OTPre<IO>>(ot, io, i);
--		io->flush();
-+		//io->flush();
- 		if(is_malicious)
- 			sender->consistency_check_msg_gen(consist_check_VW+i);
- 	}
-
-diff --git a/emp-ot/ferret/preot.h b/emp-ot/ferret/preot.h
-index 0ac7641..a0ae2d3 100644
---- a/emp-ot/ferret/preot.h
-+++ b/emp-ot/ferret/preot.h
-@@ -10,10 +10,6 @@ class OTPre { public:
- 	block * pre_data = nullptr;
- 	bool * bits = nullptr;
- 	int n;
--	vector<block*> pointers;
--	vector<const bool*> choices;
--	vector<const block*> pointers0;
--	vector<const block*> pointers1;
- 
- 	CCRH ccrh;
- 	int length, count;
-
-diff --git a/emp-ot/ferret/twokeyprp.h b/emp-ot/ferret/twokeyprp.h
-index fd6236d..c2361a3 100644
---- a/emp-ot/ferret/twokeyprp.h
-+++ b/emp-ot/ferret/twokeyprp.h
-@@ -9,8 +9,8 @@ class TwoKeyPRP { public:
- 	emp::AES_KEY aes_key[2];
-
- 	TwoKeyPRP(block seed0, block seed1) {
--		AES_set_encrypt_key((const block)seed0, aes_key);
--		AES_set_encrypt_key((const block)seed1, &aes_key[1]);
-+		AES_set_encrypt_key(seed0, aes_key);
-+		AES_set_encrypt_key(seed1, &aes_key[1]);
- 	}
-
- 	void node_expand_1to2(block *children, block parent) {
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index fa06fd7..faf9802 100755
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -12,5 +12,8 @@ include_directories(${EMP-TOOL_INCLUDE_DIRS})
- install(FILES cmake/emp-ot-config.cmake DESTINATION cmake/)
- install(DIRECTORY emp-ot DESTINATION include/)
-
--ENABLE_TESTING()
--ADD_SUBDIRECTORY(test)
-+option(ENABLE_TESTS "Enable tests" OFF)
-+if (${ENABLE_TESTS})
-+    ENABLE_TESTING()
-+    ADD_SUBDIRECTORY(test)
-+endif()
diff --git a/bazel/patches/emp-tool-cmake.patch b/bazel/patches/emp-tool-cmake.patch
deleted file mode 100644
index 01aa13dbe..000000000
--- a/bazel/patches/emp-tool-cmake.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index d9abb31..4c2c171 100755
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -56,11 +56,14 @@ find_package(OpenSSL REQUIRED)
- include_directories(${OPENSSL_INCLUDE_DIR})
- 
- 
--add_library(${NAME} SHARED ${sources})
-+add_library(${NAME} STATIC ${sources})
- 
- install(DIRECTORY emp-tool DESTINATION include/)
- install(DIRECTORY cmake/ DESTINATION cmake/)
- install(TARGETS ${NAME} DESTINATION lib)
- 
--ENABLE_TESTING()
--ADD_SUBDIRECTORY(test)
-+option(ENABLE_TESTS "Enable tests" OFF)
-+if (${ENABLE_TESTS})
-+    ENABLE_TESTING()
-+    ADD_SUBDIRECTORY(test)
-+endif()
diff --git a/bazel/patches/emp-tool-sse2neon.patch b/bazel/patches/emp-tool-sse2neon.patch
deleted file mode 100644
index e94b22e07..000000000
--- a/bazel/patches/emp-tool-sse2neon.patch
+++ /dev/null
@@ -1,6507 +0,0 @@
-diff --git a/emp-tool/utils/sse2neon.h b/emp-tool/utils/sse2neon.h
-index d09b9c7..efa63a4 100644
---- a/emp-tool/utils/sse2neon.h
-+++ b/emp-tool/utils/sse2neon.h
-@@ -113,7 +113,7 @@
- #ifdef _MSC_VER
- #include <intrin.h>
- #if (defined(_M_AMD64) || defined(__x86_64__)) || \
--    (defined(_M_ARM) || defined(__arm__))
-+    (defined(_M_ARM64) || defined(__arm64__))
- #define SSE2NEON_HAS_BITSCAN64
- #endif
- #endif
-@@ -441,7 +441,7 @@ typedef int64x2_t __m128i; /* 128-bit vector containing integers */
- // by applications which attempt to access the contents of an __m128 struct
- // directly.  It is important to note that accessing the __m128 struct directly
- // is bad coding practice by Microsoft: @see:
--// https://docs.microsoft.com/en-us/cpp/cpp/m128
-+// https://learn.microsoft.com/en-us/cpp/cpp/m128
- //
- // However, some legacy source code may try to access the contents of an __m128
- // struct directly so the developer can use the SIMDVec as an alias for it.  Any
-@@ -621,47 +621,6 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
-  *                                  4, 5, 12, 13, 6, 7, 14, 15);
-  *   // Shuffle packed 8-bit integers
-  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
-- *
-- * Data (Number, Binary, Byte Index):
--    +------+------+-------------+------+------+-------------+
--    |      1      |      2      |      3      |      4      | Number
--    +------+------+------+------+------+------+------+------+
--    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
--    +------+------+------+------+------+------+------+------+
--    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
--    +------+------+------+------+------+------+------+------+
--
--    +------+------+------+------+------+------+------+------+
--    |      5      |      6      |      7      |      8      | Number
--    +------+------+------+------+------+------+------+------+
--    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
--    +------+------+------+------+------+------+------+------+
--    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
--    +------+------+------+------+------+------+------+------+
-- * Index (Byte Index):
--    +------+------+------+------+------+------+------+------+
--    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
--    +------+------+------+------+------+------+------+------+
--
--    +------+------+------+------+------+------+------+------+
--    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
--    +------+------+------+------+------+------+------+------+
-- * Result:
--    +------+------+------+------+------+------+------+------+
--    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
--    +------+------+------+------+------+------+------+------+
--    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
--    +------+------+------+------+------+------+------+------+
--    |     256     |      2      |      5      |      6      | Number
--    +------+------+------+------+------+------+------+------+
--
--    +------+------+------+------+------+------+------+------+
--    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
--    +------+------+------+------+------+------+------+------+
--    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
--    +------+------+------+------+------+------+------+------+
--    |      3      |      7      |      4      |      8      | Number
--    +------+------+------+------+------+------+-------------+
-  */
- 
- /* Constants for use with _mm_prefetch. */
-@@ -1069,9 +1028,9 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-     })
- #endif
- 
--// NEON does not support a general purpose permute intrinsic
--// Selects four specific single-precision, floating-point values from a and b,
--// based on the mask i.
-+// NEON does not support a general purpose permute intrinsic.
-+// Shuffle single-precision (32-bit) floating-point elements in a using the
-+// control in imm8, and store the results in dst.
- //
- // C equivalent:
- //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-@@ -1082,7 +1041,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
- //       return ret;
- //   }
- //
--// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
- #define _mm_shuffle_ps_default(a, b, imm)                                  \
-     __extension__({                                                        \
-         float32x4_t ret;                                                   \
-@@ -1100,12 +1059,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-         vreinterpretq_m128_f32(ret);                                       \
-     })
- 
--// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
--// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
--//                                                   __constrange(0,255) int
--//                                                   imm)
-+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
-+// Store the results in the low 64 bits of dst, with the high 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
- #define _mm_shufflelo_epi16_function(a, imm)                                  \
-     __extension__({                                                           \
-         int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
-@@ -1120,12 +1077,10 @@ FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-         vreinterpretq_m128i_s16(ret);                                         \
-     })
- 
--// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
--// by imm.
--// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
--// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
--//                                                   __constrange(0,255) int
--//                                                   imm)
-+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
-+// Store the results in the high 64 bits of dst, with the low 64 bits being
-+// copied from from a to dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
- #define _mm_shufflehi_epi16_function(a, imm)                                   \
-     __extension__({                                                            \
-         int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
-@@ -1147,22 +1102,19 @@ FORCE_INLINE void _mm_empty(void) {}
- 
- /* SSE */
- 
--// Adds the four single-precision, floating-point values of a and b.
--//
--//   r0 := a0 + b0
--//   r1 := a1 + b1
--//   r2 := a2 + b2
--//   r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-+// Add packed single-precision (32-bit) floating-point elements in a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
- FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_f32(
-         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// adds the scalar single-precision floating point values of a and b.
--// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-+// Add the lower single-precision (32-bit) floating-point element in a and b,
-+// store the result in the lower element of dst, and copy the upper 3 packed
-+// elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
- FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
- {
-     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -1171,30 +1123,18 @@ FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-     return vreinterpretq_m128_f32(vaddq_f32(a, value));
- }
- 
--// Computes the bitwise AND of the four single-precision, floating-point values
--// of a and b.
--//
--//   r0 := a0 & b0
--//   r1 := a1 & b1
--//   r2 := a2 & b2
--//   r3 := a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-+// Compute the bitwise AND of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
- FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_s32(
-         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
- }
- 
--// Computes the bitwise AND-NOT of the four single-precision, floating-point
--// values of a and b.
--//
--//   r0 := ~a0 & b0
--//   r1 := ~a1 & b1
--//   r2 := ~a2 & b2
--//   r3 := ~a3 & b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-+// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
-+// elements in a and then AND with b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
- FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_s32(
-@@ -1204,13 +1144,7 @@ FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
- 
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--//   FOR j := 0 to 3
--//     i := j*16
--//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
- FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u16(
-@@ -1219,186 +1153,199 @@ FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
- 
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--//   FOR j := 0 to 7
--//     i := j*8
--//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
- FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u8(
-         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
- 
--// Compares for equality.
--// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for equality, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
- FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(
-         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// Compares for equality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for equality, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
- FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
- }
- 
--// Compares for greater than or equal.
--// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
- FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(
-         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// Compares for greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
- FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
- }
- 
--// Compares for greater than.
--//
--//   r0 := (a0 > b0) ? 0xffffffff : 0x0
--//   r1 := (a1 > b1) ? 0xffffffff : 0x0
--//   r2 := (a2 > b2) ? 0xffffffff : 0x0
--//   r3 := (a3 > b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
- FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(
-         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// Compares for greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for greater-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
- FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
- }
- 
--// Compares for less than or equal.
--//
--//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
--//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
--//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
--//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
- FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(
-         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// Compares for less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than-or-equal, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
- FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmple_ps(a, b));
- }
- 
--// Compares for less than
--// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
- FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(
-         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- }
- 
--// Compares for less than
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for less-than, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
- FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
- }
- 
--// Compares for inequality.
--// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
- FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(vmvnq_u32(
-         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
- 
--// Compares for inequality.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-equal, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
- FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
- }
- 
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
- FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(vmvnq_u32(
-         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
- 
--// Compares for not greater than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than-or-equal, store the result in the lower element of
-+// dst, and copy the upper 3 packed elements from a to the upper elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
- FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
- }
- 
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-greater-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
- FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(vmvnq_u32(
-         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
- 
--// Compares for not greater than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-greater-than, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
- FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
- }
- 
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than-or-equal, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
- FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(vmvnq_u32(
-         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
- 
--// Compares for not less than or equal.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than-or-equal, store the result in the lower element of dst,
-+// and copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
- FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
- }
- 
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// for not-less-than, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
- FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_u32(vmvnq_u32(
-         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
- }
- 
--// Compares for not less than.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b for not-less-than, store the result in the lower element of dst, and copy
-+// the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
- FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
- }
- 
--// Compares the four 32-bit floats in a and b to check if any values are NaN.
--// Ordered compare between each value returns true for "orderable" and false for
--// "not orderable" (NaN).
--// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
--// also:
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if neither is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
-+//
-+// See also:
- // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
- // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
- FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-@@ -1413,15 +1360,18 @@ FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
- }
- 
--// Compares for ordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if neither is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
- FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
- }
- 
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-+// Compare packed single-precision (32-bit) floating-point elements in a and b
-+// to see if either is NaN, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
- FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
- {
-     uint32x4_t f32a =
-@@ -1431,16 +1381,18 @@ FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
- }
- 
--// Compares for unordered.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b to see if either is NaN, store the result in the lower element of dst, and
-+// copy the upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
- FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using an equality operation. :
--// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for equality, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
- FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
- {
-     uint32x4_t a_eq_b =
-@@ -1448,9 +1400,9 @@ FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-     return vgetq_lane_u32(a_eq_b, 0) & 0x1;
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
- FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
- {
-     uint32x4_t a_ge_b =
-@@ -1458,9 +1410,9 @@ FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-     return vgetq_lane_u32(a_ge_b, 0) & 0x1;
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using a greater than operation. :
--// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for greater-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
- FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
- {
-     uint32x4_t a_gt_b =
-@@ -1468,9 +1420,9 @@ FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-     return vgetq_lane_u32(a_gt_b, 0) & 0x1;
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than or equal operation. :
--// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than-or-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
- FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
- {
-     uint32x4_t a_le_b =
-@@ -1478,11 +1430,9 @@ FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-     return vgetq_lane_u32(a_le_b, 0) & 0x1;
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using a less than operation. :
--// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
--// note!! The documentation on MSDN is incorrect!  If either of the values is a
--// NAN the docs say you will get a one, but in fact, it will return a zero!!
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for less-than, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
- FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
- {
-     uint32x4_t a_lt_b =
-@@ -1490,9 +1440,9 @@ FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-     return vgetq_lane_u32(a_lt_b, 0) & 0x1;
- }
- 
--// Compares the lower single-precision floating point scalar values of a and b
--// using an inequality operation. :
--// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-+// Compare the lower single-precision (32-bit) floating-point element in a and b
-+// for not-equal, and return the boolean result (0 or 1).
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
- FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- {
-     return !_mm_comieq_ss(a, b);
-@@ -1502,13 +1452,7 @@ FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
- // (32-bit) floating-point elements, store the results in the lower 2 elements
- // of dst, and copy the upper 2 packed elements from a to the upper elements of
- // dst.
--//
--//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--//   dst[95:64] := a[95:64]
--//   dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
- FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
- {
-     return vreinterpretq_m128_f32(
-@@ -1518,13 +1462,7 @@ FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
- 
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//       i := 32*j
--//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
- FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1539,11 +1477,7 @@ FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
- FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
- {
-     return vreinterpretq_m128_f32(
-@@ -1552,7 +1486,7 @@ FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
- FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1567,14 +1501,7 @@ FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
- 
- // Convert packed 16-bit integers in a to packed single-precision (32-bit)
- // floating-point elements, and store the results in dst.
--//
--//   FOR j := 0 to 3
--//      i := j*16
--//      m := j*32
--//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
- FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- {
-     return vreinterpretq_m128_f32(
-@@ -1584,13 +1511,7 @@ FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
- // Convert packed 32-bit integers in b to packed single-precision (32-bit)
- // floating-point elements, store the results in the lower 2 elements of dst,
- // and copy the upper 2 packed elements from a to the upper elements of dst.
--//
--//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
--//   dst[95:64] := a[95:64]
--//   dst[127:96] := a[127:96]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
- FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- {
-     return vreinterpretq_m128_f32(
-@@ -1603,13 +1524,7 @@ FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
- // of dst, then convert the packed signed 32-bit integers in b to
- // single-precision (32-bit) floating-point element, and store the results in
- // the upper 2 elements of dst.
--//
--//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
--//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
--//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
--//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
- FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
- {
-     return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1618,14 +1533,7 @@ FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
- 
- // Convert the lower packed 8-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--//   FOR j := 0 to 3
--//      i := j*8
--//      m := j*32
--//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
- FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- {
-     return vreinterpretq_m128_f32(vcvtq_f32_s32(
-@@ -1636,18 +1544,7 @@ FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
- // packed 16-bit integers, and store the results in dst. Note: this intrinsic
- // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
- // 0x7FFFFFFF.
--//
--//   FOR j := 0 to 3
--//     i := 16*j
--//     k := 32*j
--//     IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
--//       dst[i+15:i] := 0x7FFF
--//     ELSE
--//       dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
- FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
- {
-     return vreinterpret_m64_s16(
-@@ -1656,31 +1553,14 @@ FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
- 
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//       i := 32*j
--//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
- #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
- 
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 8-bit integers, and store the results in lower 4 elements of dst.
- // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
- // between 0x7F and 0x7FFFFFFF.
--//
--//   FOR j := 0 to 3
--//     i := 8*j
--//     k := 32*j
--//     IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
--//       dst[i+7:i] := 0x7F
--//     ELSE
--//       dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
- FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
- {
-     return vreinterpret_m64_s8(vqmovn_s16(
-@@ -1689,14 +1569,7 @@ FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
- 
- // Convert packed unsigned 16-bit integers in a to packed single-precision
- // (32-bit) floating-point elements, and store the results in dst.
--//
--//   FOR j := 0 to 3
--//      i := j*16
--//      m := j*32
--//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
- FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- {
-     return vreinterpretq_m128_f32(
-@@ -1706,14 +1579,7 @@ FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
- // Convert the lower packed unsigned 8-bit integers in a to packed
- // single-precision (32-bit) floating-point elements, and store the results in
- // dst.
--//
--//   FOR j := 0 to 3
--//      i := j*8
--//      m := j*32
--//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
- FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- {
-     return vreinterpretq_m128_f32(vcvtq_f32_u32(
-@@ -1723,21 +1589,13 @@ FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
- // Convert the signed 32-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
- #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
- 
- // Convert the signed 64-bit integer b to a single-precision (32-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper 3 packed elements from a to the upper elements of dst.
--//
--//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
- FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- {
-     return vreinterpretq_m128_f32(
-@@ -1745,10 +1603,7 @@ FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
- }
- 
- // Copy the lower single-precision (32-bit) floating-point element of a to dst.
--//
--//   dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
- FORCE_INLINE float _mm_cvtss_f32(__m128 a)
- {
-     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-@@ -1756,18 +1611,12 @@ FORCE_INLINE float _mm_cvtss_f32(__m128 a)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--//   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
- #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
- FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -1781,13 +1630,7 @@ FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
- 
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//      i := 32*j
--//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
- FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
- {
-     return vreinterpret_m64_s32(
-@@ -1796,10 +1639,7 @@ FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
- FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
- {
-     return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-@@ -1807,60 +1647,49 @@ FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
- 
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//      i := 32*j
--//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
- #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
- #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
- 
- // Convert the lower single-precision (32-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
- FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
- {
-     return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- }
- 
--// Divides the four single-precision, floating-point values of a and b.
--//
--//   r0 := a0 / b0
--//   r1 := a1 / b1
--//   r2 := a2 / b2
--//   r3 := a3 / b3
--//
--// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-+// Divide packed single-precision (32-bit) floating-point elements in a by
-+// packed elements in b, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
-+// division by multiplying a by b's reciprocal before using the Newton-Raphson
-+// method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
- FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
- {
--#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
-+#if defined(__aarch64__)
-     return vreinterpretq_m128_f32(
-         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- #else
-     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
-     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#if SSE2NEON_PRECISE_DIV
-     // Additional Netwon-Raphson iteration for accuracy
-     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
--#endif
-     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
- #endif
- }
- 
--// Divides the scalar single-precision floating point value of a by b.
--// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-+// Divide the lower single-precision (32-bit) floating-point element in a by the
-+// lower single-precision (32-bit) floating-point element in b, store the result
-+// in the lower element of dst, and copy the upper 3 packed elements from a to
-+// the upper elements of dst.
-+// Warning: ARMv7-A does not produce the same result compared to Intel and not
-+// IEEE-compliant.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
- FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
- {
-     float32_t value =
-@@ -1871,12 +1700,12 @@ FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
- 
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
- #define _mm_extract_pi16(a, imm) \
-     (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
- 
- // Free aligned memory that was allocated with _mm_malloc.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void _mm_free(void *addr)
- {
-@@ -1887,7 +1716,7 @@ FORCE_INLINE void _mm_free(void *addr)
- // Macro: Get the flush zero bits from the MXCSR control and status register.
- // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
- // _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
- FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- {
-     union {
-@@ -1911,7 +1740,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
- // Macro: Get the rounding mode bits from the MXCSR control and status register.
- // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
- // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
- FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
- {
-     union {
-@@ -1938,15 +1767,17 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
- 
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
- #define _mm_insert_pi16(a, b, imm)                               \
-     __extension__({                                              \
-         vreinterpret_m64_s16(                                    \
-             vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
-     })
- 
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
- FORCE_INLINE __m128 _mm_load_ps(const float *p)
- {
-     return vreinterpretq_m128_f32(vld1q_f32(p));
-@@ -1960,52 +1791,40 @@ FORCE_INLINE __m128 _mm_load_ps(const float *p)
- //   dst[95:64] := MEM[mem_addr+31:mem_addr]
- //   dst[127:96] := MEM[mem_addr+31:mem_addr]
- //
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
- #define _mm_load_ps1 _mm_load1_ps
- 
--// Loads an single - precision, floating - point value into the low word and
--// clears the upper three words.
--// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-+// Load a single-precision (32-bit) floating-point element from memory into the
-+// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
-+// aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
- FORCE_INLINE __m128 _mm_load_ss(const float *p)
- {
-     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
- }
- 
--// Loads a single single-precision, floating-point value, copying it into all
--// four words
--// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-+// Load a single-precision (32-bit) floating-point element from memory into all
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
- FORCE_INLINE __m128 _mm_load1_ps(const float *p)
- {
-     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
- }
- 
--// Sets the upper two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the lower two values are passed
--// through from a.
--//
--//   r0 := a0
--//   r1 := a1
--//   r2 := *p0
--//   r3 := *p1
--//
--// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
- FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
- {
-     return vreinterpretq_m128_f32(
-         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
- }
- 
--// Sets the lower two single-precision, floating-point values with 64
--// bits of data loaded from the address p; the upper two values are passed
--// through from a.
--//
--// Return Value
--//   r0 := *p0
--//   r1 := *p1
--//   r2 := a2
--//   r3 := a3
--//
--// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-+// Load 2 single-precision (32-bit) floating-point elements from memory into the
-+// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
-+// mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
- FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- {
-     return vreinterpretq_m128_f32(
-@@ -2015,21 +1834,17 @@ FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
- // Load 4 single-precision (32-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
--//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
--//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
--//   dst[127:96] := MEM[mem_addr+31:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
- FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
- {
-     float32x4_t v = vrev64q_f32(vld1q_f32(p));
-     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
- }
- 
--// Loads four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from memory into dst. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
- FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- {
-     // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-@@ -2038,11 +1853,7 @@ FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
- }
- 
- // Load unaligned 16-bit integer from memory into the first element of dst.
--//
--//   dst[15:0] := MEM[mem_addr+15:mem_addr]
--//   dst[MAX:16] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
- FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- {
-     return vreinterpretq_m128i_s16(
-@@ -2050,20 +1861,17 @@ FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
- }
- 
- // Load unaligned 64-bit integer from memory into the first element of dst.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[MAX:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
- FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
- {
-     return vreinterpretq_m128i_s64(
-         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
- }
- 
--// Allocate aligned blocks of memory.
--// https://software.intel.com/en-us/
--//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-+// Allocate size bytes of memory, aligned to the alignment specified in align,
-+// and return a pointer to the allocated memory. _mm_free should be used to free
-+// memory that is allocated with _mm_malloc.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
- #if !defined(SSE2NEON_ALLOC_DEFINED)
- FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- {
-@@ -2081,7 +1889,7 @@ FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
- FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- {
-     int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
-@@ -2095,27 +1903,23 @@ FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
- // Conditionally store 8-bit integer elements from a into memory using mask
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
- #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
- 
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--//   FOR j := 0 to 3
--//      i := j*16
--//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
- FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s16(
-         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
- 
--// Computes the maximums of the four single-precision, floating-point values of
--// a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed maximum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
- FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2130,22 +1934,19 @@ FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
- 
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--//
--//   FOR j := 0 to 7
--//      i := j*8
--//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
- FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u8(
-         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
- 
--// Computes the maximum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the maximum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper element of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
- FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
- {
-     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
-@@ -2155,22 +1956,18 @@ FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
- 
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--//   FOR j := 0 to 3
--//      i := j*16
--//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
- FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s16(
-         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
- }
- 
--// Computes the minima of the four single-precision, floating-point values of a
--// and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-+// Compare packed single-precision (32-bit) floating-point elements in a and b,
-+// and store packed minimum values in dst. dst does not follow the IEEE Standard
-+// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
-+// signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
- FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
- {
- #if SSE2NEON_PRECISE_MINMAX
-@@ -2185,22 +1982,19 @@ FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
- 
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--//
--//   FOR j := 0 to 7
--//      i := j*8
--//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
- FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u8(
-         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- }
- 
--// Computes the minimum of the two lower scalar single-precision floating point
--// values of a and b.
--// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-+// Compare the lower single-precision (32-bit) floating-point elements in a and
-+// b, store the minimum value in the lower element of dst, and copy the upper 3
-+// packed elements from a to the upper elements of dst. dst does not follow the
-+// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
-+// inputs are NaN or signed-zero values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
- FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
- {
-     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
-@@ -2208,8 +2002,10 @@ FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
- }
- 
--// Sets the low word to the single-precision, floating-point value of b
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-+// Move the lower single-precision (32-bit) floating-point element from b to the
-+// lower element of dst, and copy the upper 3 packed elements from a to the
-+// upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
- FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_f32(
-@@ -2217,25 +2013,26 @@ FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-                        vreinterpretq_f32_m128(a), 0));
- }
- 
--// Moves the upper two values of B into the lower two values of A.
--//
--//   r3 := a3
--//   r2 := a2
--//   r1 := b3
--//   r0 := b2
--FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
--{
--    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
--    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
-+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
-+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
-+// upper 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
-+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
-+{
-+#if defined(__aarch64__)
-+    return vreinterpretq_m128_u64(
-+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
-+#else
-+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-+#endif
- }
- 
--// Moves the lower two values of B into the upper two values of A.
--//
--//   r3 := b1
--//   r2 := b0
--//   r1 := a1
--//   r0 := a0
-+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
-+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
-+// lower 2 elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
- FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
- {
-     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-@@ -2245,7 +2042,7 @@ FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
- 
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
- FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- {
-     uint8x8_t input = vreinterpret_u8_m64(a);
-@@ -2264,10 +2061,9 @@ FORCE_INLINE int _mm_movemask_pi8(__m64 a)
- #endif
- }
- 
--// NEON does not provide this method
--// Creates a 4-bit mask from the most significant bits of the four
--// single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-+// Set each bit of mask dst based on the most significant bit of the
-+// corresponding packed single-precision (32-bit) floating-point element in a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
- FORCE_INLINE int _mm_movemask_ps(__m128 a)
- {
-     uint32x4_t input = vreinterpretq_u32_m128(a);
-@@ -2288,14 +2084,9 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
- #endif
- }
- 
--// Multiplies the four single-precision, floating-point values of a and b.
--//
--//   r0 := a0 * b0
--//   r1 := a1 * b1
--//   r2 := a2 * b2
--//   r3 := a3 * b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
- FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_f32(
-@@ -2305,11 +2096,7 @@ FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
- // Multiply the lower single-precision (32-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper 3 packed
- // elements from a to the upper elements of dst.
--//
--//   dst[31:0] := a[31:0] * b[31:0]
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
- FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_mul_ps(a, b));
-@@ -2318,16 +2105,16 @@ FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
- FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u16(vshrn_n_u32(
-         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
- }
- 
--// Computes the bitwise OR of the four single-precision, floating-point values
--// of a and b.
--// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-+// Compute the bitwise OR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
- FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_s32(
-@@ -2336,65 +2123,53 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
- 
- // Average packed unsigned 8-bit integers in a and b, and store the results in
- // dst.
--//
--//   FOR j := 0 to 7
--//     i := j*8
--//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
- #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
- 
- // Average packed unsigned 16-bit integers in a and b, and store the results in
- // dst.
--//
--//   FOR j := 0 to 3
--//     i := j*16
--//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
- #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
- 
- // Extract a 16-bit integer from a, selected with imm8, and store the result in
- // the lower element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
- #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
- 
- // Copy a to dst, and insert the 16-bit integer i into dst at the location
- // specified by imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pinsrw
- #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
- 
- // Compare packed signed 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
- #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
- 
- // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
- #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
- 
- // Compare packed signed 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
- #define _m_pminsw(a, b) _mm_min_pi16(a, b)
- 
- // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
- #define _m_pminub(a, b) _mm_min_pu8(a, b)
- 
- // Create mask from the most significant bit of each 8-bit element in a, and
- // store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
- #define _m_pmovmskb(a) _mm_movemask_pi8(a)
- 
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
- #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
- 
- // Fetch the line of data from memory that contains address p to a location in
-@@ -2422,26 +2197,22 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_psadbw
- #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
- 
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
- #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
- 
- // Compute the approximate reciprocal of packed single-precision (32-bit)
- // floating-point elements in a, and store the results in dst. The maximum
- // relative error for this approximation is less than 1.5*2^-12.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
- FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- {
-     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#if SSE2NEON_PRECISE_DIV
--    // Additional Netwon-Raphson iteration for accuracy
--    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
--#endif
-     return vreinterpretq_m128_f32(recip);
- }
- 
-@@ -2449,30 +2220,21 @@ FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
- // floating-point element in a, store the result in the lower element of dst,
- // and copy the upper 3 packed elements from a to the upper elements of dst. The
- // maximum relative error for this approximation is less than 1.5*2^-12.
--//
--//   dst[31:0] := (1.0 / a[31:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
- FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
- {
-     return _mm_move_ss(a, _mm_rcp_ps(a));
- }
- 
--// Computes the approximations of the reciprocal square roots of the four
--// single-precision floating point values of in.
--// The current precision is 1% error.
--// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-+// Compute the approximate reciprocal square root of packed single-precision
-+// (32-bit) floating-point elements in a, and store the results in dst. The
-+// maximum relative error for this approximation is less than 1.5*2^-12.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
- FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- {
-     float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
--#if SSE2NEON_PRECISE_SQRT
--    // Additional Netwon-Raphson iteration for accuracy
-     out = vmulq_f32(
-         out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
--    out = vmulq_f32(
--        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
--#endif
-     return vreinterpretq_m128_f32(out);
- }
- 
-@@ -2480,7 +2242,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
- // (32-bit) floating-point element in a, store the result in the lower element
- // of dst, and copy the upper 3 packed elements from a to the upper elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
- FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- {
-     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-@@ -2490,7 +2252,7 @@ FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
- // b, then horizontally sum each consecutive 8 differences to produce four
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
- FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- {
-     uint64x1_t t = vpaddl_u32(vpaddl_u16(
-@@ -2502,7 +2264,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
- // Macro: Set the flush zero bits of the MXCSR control and status register to
- // the value in unsigned 32-bit integer a. The flush zero may contain any of the
- // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
- FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- {
-     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
-@@ -2531,16 +2293,18 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
- #endif
- }
- 
--// Sets the four single-precision, floating-point values to the four inputs.
--// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
- FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
- {
-     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-     return vreinterpretq_m128_f32(vld1q_f32(data));
- }
- 
--// Sets the four single-precision, floating-point values to w.
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
- FORCE_INLINE __m128 _mm_set_ps1(float _w)
- {
-     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-@@ -2550,7 +2314,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
- // the value in unsigned 32-bit integer a. The rounding mode may contain any of
- // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
- // _MM_ROUND_TOWARD_ZERO
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
- FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
- {
-     union {
-@@ -2595,45 +2359,48 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
- 
- // Copy single-precision (32-bit) floating-point element a to the lower element
- // of dst, and zero the upper 3 elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
- FORCE_INLINE __m128 _mm_set_ss(float a)
- {
-     return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
- }
- 
--// Sets the four single-precision, floating-point values to w.
--//
--//   r0 := r1 := r2 := r3 := w
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-+// Broadcast single-precision (32-bit) floating-point value a to all elements of
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
- FORCE_INLINE __m128 _mm_set1_ps(float _w)
- {
-     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
- }
- 
-+// Set the MXCSR control and status register with the value in unsigned 32-bit
-+// integer a.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
- // FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
- FORCE_INLINE void _mm_setcsr(unsigned int a)
- {
-     _MM_SET_ROUNDING_MODE(a);
- }
- 
-+// Get the unsigned 32-bit value of the MXCSR control and status register.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
- // FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
- FORCE_INLINE unsigned int _mm_getcsr()
- {
-     return _MM_GET_ROUNDING_MODE();
- }
- 
--// Sets the four single-precision, floating-point values to the four inputs in
--// reverse order.
--// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-+// Set packed single-precision (32-bit) floating-point elements in dst with the
-+// supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
- FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
- {
-     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-     return vreinterpretq_m128_f32(vld1q_f32(data));
- }
- 
--// Clears the four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-+// Return vector of type __m128 with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
- FORCE_INLINE __m128 _mm_setzero_ps(void)
- {
-     return vreinterpretq_m128_f32(vdupq_n_f32(0));
-@@ -2641,7 +2408,7 @@ FORCE_INLINE __m128 _mm_setzero_ps(void)
- 
- // Shuffle 16-bit integers in a using the control in imm8, and store the results
- // in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pi16(a, imm)                                           \
-     __extension__({                                                        \
-@@ -2775,19 +2542,17 @@ FORCE_INLINE void _mm_lfence(void)
-     })
- #endif
- 
--// Computes the approximations of square roots of the four single-precision,
--// floating-point values of a. First computes reciprocal square roots and then
--// reciprocals of the four values.
--//
--//   r0 := sqrt(a0)
--//   r1 := sqrt(a1)
--//   r2 := sqrt(a2)
--//   r3 := sqrt(a3)
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-+// Compute the square root of packed single-precision (32-bit) floating-point
-+// elements in a, and store the results in dst.
-+// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
-+// square root by multiplying input in with its reciprocal square root before
-+// using the Newton-Raphson method to approximate the results.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
- FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
- {
--#if SSE2NEON_PRECISE_SQRT
-+#if defined(__aarch64__)
-+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-+#else
-     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
- 
-     // Test for vrsqrteq_f32(0) -> positive infinity case.
-@@ -2798,28 +2563,23 @@ FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-     recip = vreinterpretq_f32_u32(
-         vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
- 
--    // Additional Netwon-Raphson iteration for accuracy
-     recip = vmulq_f32(
-         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-         recip);
-+    // Additional Newton-Raphson iteration for accuracy
-     recip = vmulq_f32(
-         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
-         recip);
- 
-     // sqrt(s) = s * 1/sqrt(s)
-     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
--#elif defined(__aarch64__)
--    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
--#else
--    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
--    float32x4_t sq = vrecpeq_f32(recipsq);
--    return vreinterpretq_m128_f32(sq);
- #endif
- }
- 
--// Computes the approximation of the square root of the scalar single-precision
--// floating point value of in.
--// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-+// Compute the square root of the lower single-precision (32-bit) floating-point
-+// element in a, store the result in the lower element of dst, and copy the
-+// upper 3 packed elements from a to the upper elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
- FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
- {
-     float32_t value =
-@@ -2828,8 +2588,10 @@ FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
- }
- 
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-+// or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
- FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- {
-     vst1q_f32(p, vreinterpretq_f32_m128(a));
-@@ -2838,21 +2600,16 @@ FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--//   MEM[mem_addr+31:mem_addr] := a[31:0]
--//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
--//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
--//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
- FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
- {
-     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-     vst1q_f32(p, vdupq_n_f32(a0));
- }
- 
--// Stores the lower single - precision, floating - point value.
--// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-+// Store the lower single-precision (32-bit) floating-point element from a into
-+// memory. mem_addr does not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
- FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- {
-     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-@@ -2861,34 +2618,20 @@ FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
- // Store the lower single-precision (32-bit) floating-point element from a into
- // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--//
--//   MEM[mem_addr+31:mem_addr] := a[31:0]
--//   MEM[mem_addr+63:mem_addr+32] := a[31:0]
--//   MEM[mem_addr+95:mem_addr+64] := a[31:0]
--//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
- #define _mm_store1_ps _mm_store_ps1
- 
--// Stores the upper two single-precision, floating-point values of a to the
--// address p.
--//
--//   *p0 := a2
--//   *p1 := a3
--//
--// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-+// Store the upper 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
- FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
- {
-     *p = vreinterpret_m64_f32(vget_high_f32(a));
- }
- 
--// Stores the lower two single-precision floating point values of a to the
--// address p.
--//
--//   *p0 := a0
--//   *p1 := a1
--//
--// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-+// Store the lower 2 single-precision (32-bit) floating-point elements from a
-+// into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
- FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- {
-     *p = vreinterpret_m64_f32(vget_low_f32(a));
-@@ -2897,13 +2640,7 @@ FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
- // Store 4 single-precision (32-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--//   MEM[mem_addr+31:mem_addr] := a[127:96]
--//   MEM[mem_addr+63:mem_addr+32] := a[95:64]
--//   MEM[mem_addr+95:mem_addr+64] := a[63:32]
--//   MEM[mem_addr+127:mem_addr+96] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
- FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
- {
-     float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
-@@ -2911,22 +2648,24 @@ FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
-     vst1q_f32(p, rev);
- }
- 
--// Stores four single-precision, floating-point values.
--// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
-+// elements) from a into memory. mem_addr does not need to be aligned on any
-+// particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
- FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
- {
-     vst1q_f32(p, vreinterpretq_f32_m128(a));
- }
- 
- // Stores 16-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
- FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
- {
-     vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
- }
- 
- // Stores 64-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
- FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
- {
-     vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
-@@ -2934,7 +2673,7 @@ FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
- 
- // Store 64-bits of integer data from a into memory using a non-temporal memory
- // hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
- FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
- {
-     vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
-@@ -2942,7 +2681,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
- 
- // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
- // point elements) from a into memory using a non-temporal memory hint.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
- FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -2952,14 +2691,10 @@ FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
- #endif
- }
- 
--// Subtracts the four single-precision, floating-point values of a and b.
--//
--//   r0 := a0 - b0
--//   r1 := a1 - b1
--//   r2 := a2 - b2
--//   r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-+// Subtract packed single-precision (32-bit) floating-point elements in b from
-+// packed single-precision (32-bit) floating-point elements in a, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
- FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_f32(
-@@ -2970,11 +2705,7 @@ FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
- // the lower single-precision (32-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper 3 packed elements from
- // a to the upper elements of dst.
--//
--//   dst[31:0] := a[31:0] - b[31:0]
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
- FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_sub_ps(a, b));
-@@ -2983,7 +2714,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
- // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
- // transposed matrix in these vectors (row0 now contains column 0, etc.).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
- #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
-     do {                                                  \
-         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
-@@ -3008,7 +2739,7 @@ FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
- #define _mm_ucomineq_ss _mm_comineq_ss
- 
- // Return vector of type __m128i with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
- FORCE_INLINE __m128i _mm_undefined_si128(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3023,7 +2754,7 @@ FORCE_INLINE __m128i _mm_undefined_si128(void)
- }
- 
- // Return vector of type __m128 with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
- FORCE_INLINE __m128 _mm_undefined_ps(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -3037,15 +2768,9 @@ FORCE_INLINE __m128 _mm_undefined_ps(void)
- #endif
- }
- 
--// Selects and interleaves the upper two single-precision, floating-point values
--// from a and b.
--//
--//   r0 := a2
--//   r1 := b2
--//   r2 := a3
--//   r3 := b3
--//
--// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the high half a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
- FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3059,15 +2784,9 @@ FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
- #endif
- }
- 
--// Selects and interleaves the lower two single-precision, floating-point values
--// from a and b.
--//
--//   r0 := a0
--//   r1 := b0
--//   r2 := a1
--//   r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-+// Unpack and interleave single-precision (32-bit) floating-point elements from
-+// the low half of a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
- FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -3081,9 +2800,9 @@ FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
- #endif
- }
- 
--// Computes bitwise EXOR (exclusive-or) of the four single-precision,
--// floating-point values of a and b.
--// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-+// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
-+// elements in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
- FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
- {
-     return vreinterpretq_m128_s32(
-@@ -3092,42 +2811,32 @@ FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
- 
- /* SSE2 */
- 
--// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
--// unsigned 16-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-+// Add packed 16-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
- FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--//
--//   r0 := a0 + b0
--//   r1 := a1 + b1
--//   r2 := a2 + b2
--//   r3 := a3 + b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 32-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
- FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
--// unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-+// Add packed 64-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
- FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s64(
-         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
- }
- 
--// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
--// unsigned 8-bit integers in b.
--// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-+// Add packed 8-bit integers in a and b, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
- FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -3136,7 +2845,7 @@ FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
- 
- // Add packed double-precision (64-bit) floating-point elements in a and b, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
- FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3155,11 +2864,7 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
- // Add the lower double-precision (64-bit) floating-point element in a and b,
- // store the result in the lower element of dst, and copy the upper element from
- // a to the upper element of dst.
--//
--//   dst[63:0] := a[63:0] + b[63:0]
--//   dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
- FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3175,25 +2880,16 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
- }
- 
- // Add 64-bit integers a and b, and store the result in dst.
--//
--//   dst[63:0] := a[63:0] + b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
- FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s64(
-         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
- 
--// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
--// and saturates.
--//
--//   r0 := SignedSaturate(a0 + b0)
--//   r1 := SignedSaturate(a1 + b1)
--//   ...
--//   r7 := SignedSaturate(a7 + b7)
--//
--// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-+// Add packed signed 16-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
- FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-@@ -3202,13 +2898,7 @@ FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
- 
- // Add packed signed 8-bit integers in a and b using saturation, and store the
- // results in dst.
--//
--//   FOR j := 0 to 15
--//     i := j*8
--//     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
- FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -3217,16 +2907,16 @@ FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
- 
- // Add packed unsigned 16-bit integers in a and b using saturation, and store
- // the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
- FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
- 
--// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
--// b and saturates..
--// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-+// Add packed unsigned 8-bit integers in a and b using saturation, and store the
-+// results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
- FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -3235,25 +2925,16 @@ FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
- 
- // Compute the bitwise AND of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
- FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
- {
-     return vreinterpretq_m128d_s64(
-         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
- 
--// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
--// b.
--//
--//   r := a & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
- FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-@@ -3262,13 +2943,7 @@ FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
- 
- // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
- // elements in a and then AND with b, and store the results in dst.
--//
--//   FOR j := 0 to 1
--// 	     i := j*64
--// 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
- FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
- {
-     // *NOTE* argument swap
-@@ -3276,12 +2951,9 @@ FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
- }
- 
--// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
--// 128-bit value in a.
--//
--//   r := (~a) & b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-+// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
-+// AND with b, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
- FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-@@ -3289,30 +2961,18 @@ FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
- }
- 
--// Computes the average of the 8 unsigned 16-bit integers in a and the 8
--// unsigned 16-bit integers in b and rounds.
--//
--//   r0 := (a0 + b0) / 2
--//   r1 := (a1 + b1) / 2
--//   ...
--//   r7 := (a7 + b7) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-+// Average packed unsigned 16-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
- FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
- {
-     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
-                                  vreinterpretq_u16_m128i(b));
- }
- 
--// Computes the average of the 16 unsigned 8-bit integers in a and the 16
--// unsigned 8-bit integers in b and rounds.
--//
--//   r0 := (a0 + b0) / 2
--//   r1 := (a1 + b1) / 2
--//   ...
--//   r15 := (a15 + b15) / 2
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-+// Average packed unsigned 8-bit integers in a and b, and store the results in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
- FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -3321,17 +2981,17 @@ FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
- 
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
- #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
- 
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
- #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
- 
- // Cast vector of type __m128d to type __m128. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
- FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
- {
-     return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-@@ -3339,7 +2999,7 @@ FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
- 
- // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
- FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
- {
-     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-@@ -3347,15 +3007,15 @@ FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
- 
- // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
- FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
- {
-     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
- }
- 
--// Applies a type cast to reinterpret four 32-bit floating point values passed
--// in as a 128-bit parameter as packed 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-+// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
- FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
- {
-     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-@@ -3363,7 +3023,7 @@ FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
- 
- // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
- // compilation and does not generate any instructions, thus it has zero latency.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
- FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -3373,9 +3033,9 @@ FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
- #endif
- }
- 
--// Applies a type cast to reinterpret four 32-bit integers passed in as a
--// 128-bit parameter as packed 32-bit floating point values.
--// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-+// Cast vector of type __m128i to type __m128. This intrinsic is only used for
-+// compilation and does not generate any instructions, thus it has zero latency.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
- FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
- {
-     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-@@ -3406,9 +3066,9 @@ FORCE_INLINE void _mm_clflush(void const *p)
- #endif
- }
- 
--// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
--// unsigned 16-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-+// Compare packed 16-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
- FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-@@ -3416,16 +3076,17 @@ FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
- }
- 
- // Compare packed 32-bit integers in a and b for equality, and store the results
--// in dst
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
- FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u32(
-         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
--// unsigned 8-bit integers in b for equality.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-+// Compare packed 8-bit integers in a and b for equality, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
- FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -3434,7 +3095,7 @@ FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for equality, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
- FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3452,7 +3113,7 @@ FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for equality, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
- FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
-@@ -3460,7 +3121,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
- FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3482,7 +3143,7 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
- FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3500,39 +3161,27 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
- #endif
- }
- 
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for greater than.
--//
--//   r0 := (a0 > b0) ? 0xffff : 0x0
--//   r1 := (a1 > b1) ? 0xffff : 0x0
--//   ...
--//   r7 := (a7 > b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
- FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for greater than.
--// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
- FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u32(
-         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for greater than.
--//
--//   r0 := (a0 > b0) ? 0xff : 0x0
--//   r1 := (a1 > b1) ? 0xff : 0x0
--//   ...
--//   r15 := (a15 > b15) ? 0xff : 0x0
--//
--// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-+// Compare packed signed 8-bit integers in a and b for greater-than, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
- FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -3541,7 +3190,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
- FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3563,7 +3212,7 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for greater-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
- FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3583,7 +3232,7 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
- FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3605,7 +3254,7 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than-or-equal, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
- FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3623,34 +3272,30 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
- #endif
- }
- 
--// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
--// in b for less than.
--//
--//   r0 := (a0 < b0) ? 0xffff : 0x0
--//   r1 := (a1 < b1) ? 0xffff : 0x0
--//   ...
--//   r7 := (a7 < b7) ? 0xffff : 0x0
--//
--// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
- FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--
--// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
--// in b for less than.
--// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
- FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u32(
-         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
--// in b for lesser than.
--// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-+// Compare packed signed 8-bit integers in a and b for less-than, and store the
-+// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
-+// order of the operands switched.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
- FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -3659,7 +3304,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
- FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3681,7 +3326,7 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for less-than, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
- FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3700,7 +3345,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
- FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3718,7 +3363,7 @@ FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-equal, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
- FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
-@@ -3726,7 +3371,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
- FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3751,7 +3396,7 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than-or-equal, store the result in the lower element of
- // dst, and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
- FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
-@@ -3759,7 +3404,7 @@ FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-greater-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd
- FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3784,7 +3429,7 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-greater-than, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
- FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
-@@ -3792,7 +3437,7 @@ FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than-or-equal, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
- FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3817,7 +3462,7 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than-or-equal, store the result in the lower element of dst,
- // and copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
- FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
-@@ -3825,7 +3470,7 @@ FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // for not-less-than, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
- FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3850,7 +3495,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b for not-less-than, store the result in the lower element of dst, and copy
- // the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
- FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
-@@ -3858,7 +3503,7 @@ FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if neither is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
- FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3890,7 +3535,7 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if neither is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
- FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3912,7 +3557,7 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b
- // to see if either is NaN, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
- FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3945,7 +3590,7 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b to see if either is NaN, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
- FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3967,7 +3612,7 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
- FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3982,7 +3627,7 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for greater-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
- FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -3997,7 +3642,7 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than-or-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
- FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4012,7 +3657,7 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for less-than, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
- FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4027,7 +3672,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for equality, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
- FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4048,7 +3693,7 @@ FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
- 
- // Compare the lower double-precision (64-bit) floating-point element in a and b
- // for not-equal, and return the boolean result (0 or 1).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
- FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
- {
-     return !_mm_comieq_sd(a, b);
-@@ -4056,14 +3701,7 @@ FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
- 
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*32
--//     m := j*64
--//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
- FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- {
- #if defined(__aarch64__)
-@@ -4076,9 +3714,9 @@ FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
- #endif
- }
- 
--// Converts the four signed 32-bit integer values of a to single-precision,
--// floating-point values
--// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-+// Convert packed signed 32-bit integers in a to packed single-precision
-+// (32-bit) floating-point elements, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
- FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
- {
-     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-@@ -4086,14 +3724,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
- 
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//      i := 32*j
--//      k := 64*j
--//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
- FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
- {
- // vrnd32xq_f64 not supported on clang
-@@ -4112,14 +3743,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
- 
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//      i := 32*j
--//      k := 64*j
--//      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
- FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- {
-     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-@@ -4132,15 +3756,7 @@ FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed single-precision (32-bit) floating-point elements, and store the
- // results in dst.
--//
--//   FOR j := 0 to 1
--//     i := 32*j
--//     k := 64*j
--//     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
--//   ENDFOR
--//   dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
- FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4155,14 +3771,7 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
- 
- // Convert packed signed 32-bit integers in a to packed double-precision
- // (64-bit) floating-point elements, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*32
--//     m := j*64
--//     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
- FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- {
- #if defined(__aarch64__)
-@@ -4175,15 +3784,9 @@ FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
- #endif
- }
- 
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values.
--//
--//   r0 := (int) a0
--//   r1 := (int) a1
--//   r2 := (int) a2
--//   r3 := (int) a3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
- // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
- // does not support! It is supported on ARMv8-A however.
- FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-@@ -4240,14 +3843,7 @@ FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
- // Convert packed single-precision (32-bit) floating-point elements in a to
- // packed double-precision (64-bit) floating-point elements, and store the
- // results in dst.
--//
--//   FOR j := 0 to 1
--//     i := 64*j
--//     k := 32*j
--//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
- FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -4261,10 +3857,7 @@ FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
- }
- 
- // Copy the lower double-precision (64-bit) floating-point element of a to dst.
--//
--//   dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
- FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4276,10 +3869,7 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer, and store the result in dst.
--//
--//   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
- FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4293,10 +3883,7 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
- FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4310,17 +3897,14 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
- #define _mm_cvtsd_si64x _mm_cvtsd_si64
- 
- // Convert the lower double-precision (64-bit) floating-point element in b to a
- // single-precision (32-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper 3 packed elements from a to the
- // upper elements of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
- FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4334,33 +3918,27 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
- }
- 
- // Copy the lower 32-bit integer in a to dst.
--//
--//   dst[31:0] := a[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
- FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
- {
-     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
- }
- 
- // Copy the lower 64-bit integer in a to dst.
--//
--//   dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
- FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
- {
-     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
- }
- 
- // Copy the lower 64-bit integer in a to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
- 
- // Convert the signed 32-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
- FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- {
- #if defined(__aarch64__)
-@@ -4374,21 +3952,12 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
- }
- 
- // Copy the lower 64-bit integer in a to dst.
--//
--//   dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
- #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
- 
--// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
--// zero extending the upper bits.
--//
--//   r0 := a
--//   r1 := 0x0
--//   r2 := 0x0
--//   r3 := 0x0
--//
--// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
-+// elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
- FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- {
-     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-@@ -4397,7 +3966,7 @@ FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
- FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- {
- #if defined(__aarch64__)
-@@ -4410,11 +3979,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
- #endif
- }
- 
--// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
--// zero extending the upper bits.
--//
--//   r0 := a
--//   r1 := 0x0
-+// Copy 64-bit integer a to the lower element of dst, and zero the upper
-+// element.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
- FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
- {
-     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-@@ -4422,24 +3989,20 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
- 
- // Copy 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
- #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
- 
- // Convert the signed 64-bit integer b to a double-precision (64-bit)
- // floating-point element, store the result in the lower element of dst, and
- // copy the upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
- #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
- 
- // Convert the lower single-precision (32-bit) floating-point element in b to a
- // double-precision (64-bit) floating-point element, store the result in the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--//   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
--//   dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
- FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
- {
-     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-@@ -4454,7 +4017,7 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
- 
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
- FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
- {
-     double a0 = ((double *) &a)[0];
-@@ -4464,7 +4027,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
- 
- // Convert packed double-precision (64-bit) floating-point elements in a to
- // packed 32-bit integers with truncation, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
- FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
- {
-     double a0 = ((double *) &a)[0];
-@@ -4473,9 +4036,9 @@ FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
-     return vreinterpret_m64_s32(vld1_s32(data));
- }
- 
--// Converts the four single-precision, floating-point values of a to signed
--// 32-bit integer values using truncate.
--// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-+// Convert packed single-precision (32-bit) floating-point elements in a to
-+// packed 32-bit integers with truncation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
- FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
- {
-     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-@@ -4483,10 +4046,7 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 32-bit integer with truncation, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
- FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
- {
-     double ret = *((double *) &a);
-@@ -4495,10 +4055,7 @@ FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
- FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -4511,21 +4068,12 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
- 
- // Convert the lower double-precision (64-bit) floating-point element in a to a
- // 64-bit integer with truncation, and store the result in dst.
--//
--//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
- #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
- 
- // Divide packed double-precision (64-bit) floating-point elements in a by
- // packed elements in b, and store the results in dst.
--//
--//  FOR j := 0 to 1
--//    i := 64*j
--//    dst[i+63:i] := a[i+63:i] / b[i+63:i]
--//  ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
- FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4545,7 +4093,7 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
- // lower double-precision (64-bit) floating-point element in b, store the result
- // in the lower element of dst, and copy the upper element from a to the upper
- // element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
- FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4558,16 +4106,16 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
- #endif
- }
- 
--// Extracts the selected signed or unsigned 16-bit integer from a and zero
--// extends.
--// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-+// Extract a 16-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
- // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
- #define _mm_extract_epi16(a, imm) \
-     vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
- 
--// Inserts the least significant 16 bits of b into the selected 16-bit integer
--// of a.
--// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-+// Copy a to dst, and insert the 16-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
- // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
- //                                       __constrange(0,8) int imm)
- #define _mm_insert_epi16(a, b, imm)                                  \
-@@ -4576,12 +4124,10 @@ FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
-             vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-     })
- 
--// Loads two double-precision from 16-byte aligned memory, floating-point
--// values.
--//
--//   dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-+// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
-+// boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
- FORCE_INLINE __m128d _mm_load_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4595,21 +4141,13 @@ FORCE_INLINE __m128d _mm_load_pd(const double *p)
- 
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
- #define _mm_load_pd1 _mm_load1_pd
- 
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower of dst, and zero the upper element. mem_addr does not need to be
- // aligned on any particular boundary.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
- FORCE_INLINE __m128d _mm_load_sd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4621,8 +4159,9 @@ FORCE_INLINE __m128d _mm_load_sd(const double *p)
- #endif
- }
- 
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
- FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
- {
-     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-@@ -4630,11 +4169,7 @@ FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
- 
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
- FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4647,11 +4182,7 @@ FORCE_INLINE __m128d _mm_load1_pd(const double *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // upper element of dst, and copy the lower element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--//   dst[63:0] := a[63:0]
--//   dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
- FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4664,7 +4195,7 @@ FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
- }
- 
- // Load 64-bit integer from memory into the first element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
- FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- {
-     /* Load the lower 64 bits of the value pointed to by p into the
-@@ -4677,11 +4208,7 @@ FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
- // Load a double-precision (64-bit) floating-point element from memory into the
- // lower element of dst, and copy the upper element from a to dst. mem_addr does
- // not need to be aligned on any particular boundary.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
- FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- {
- #if defined(__aarch64__)
-@@ -4697,11 +4224,7 @@ FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
- // Load 2 double-precision (64-bit) floating-point elements from memory into dst
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
--//   dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
- FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- {
- #if defined(__aarch64__)
-@@ -4714,39 +4237,32 @@ FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
- }
- 
- // Loads two double-precision from unaligned memory, floating-point values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
- FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
- {
-     return _mm_load_pd(p);
- }
- 
--// Loads 128-bit value. :
--// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
- FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
- {
-     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
- }
- 
- // Load unaligned 32-bit integer from memory into the first element of dst.
--//
--//   dst[31:0] := MEM[mem_addr+31:mem_addr]
--//   dst[MAX:32] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
- FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
- {
-     return vreinterpretq_m128i_s32(
-         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
- }
- 
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--//   r0 := (a0 * b0) + (a1 * b1)
--//   r1 := (a2 * b2) + (a3 * b3)
--//   r2 := (a4 * b4) + (a5 * b5)
--//   r3 := (a6 * b6) + (a7 * b7)
--// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-+// Multiply packed signed 16-bit integers in a and b, producing intermediate
-+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
-+// 32-bit integers, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
- FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- {
-     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-@@ -4771,7 +4287,7 @@ FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
- // (elements are not stored when the highest bit is not set in the corresponding
- // element) and a non-temporal memory hint. mem_addr does not need to be aligned
- // on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
- FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
- {
-     int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
-@@ -4782,18 +4298,18 @@ FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
-     vst1q_s8((int8_t *) mem_addr, masked);
- }
- 
--// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
- FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
- FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -4802,7 +4318,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed maximum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
- FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4830,7 +4346,7 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the maximum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
- FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4843,18 +4359,18 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
- #endif
- }
- 
--// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
--// signed 16-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-+// Compare packed signed 16-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
- FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
--// 16 unsigned 8-bit integers from b.
--// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
-+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
- FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -4863,7 +4379,7 @@ FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
- 
- // Compare packed double-precision (64-bit) floating-point elements in a and b,
- // and store packed minimum values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
- FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4890,7 +4406,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
- // Compare the lower double-precision (64-bit) floating-point elements in a and
- // b, store the minimum value in the lower element of dst, and copy the upper
- // element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
- FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -4905,11 +4421,7 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
- 
- // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
- // upper element.
--//
--//   dst[63:0] := a[63:0]
--//   dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
- FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- {
-     return vreinterpretq_m128i_s64(
-@@ -4919,11 +4431,7 @@ FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
- // Move the lower double-precision (64-bit) floating-point element from b to the
- // lower element of dst, and copy the upper element from a to the upper element
- // of dst.
--//
--//   dst[63:0] := b[63:0]
--//   dst[127:64] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
- FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
- {
-     return vreinterpretq_m128d_f32(
-@@ -4931,10 +4439,9 @@ FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-                      vget_high_f32(vreinterpretq_f32_m128d(a))));
- }
- 
--// NEON does not provide a version of this function.
--// Creates a 16-bit mask from the most significant bits of the 16 signed or
--// unsigned 8-bit integers in a and zero extends the upper bits.
--// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-+// Create mask from the most significant bit of each 8-bit element in a, and
-+// store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
- FORCE_INLINE int _mm_movemask_epi8(__m128i a)
- {
-     // Use increasingly wide shifts+adds to collect the sign bits
-@@ -5017,7 +4524,7 @@ FORCE_INLINE int _mm_movemask_epi8(__m128i a)
- 
- // Set each bit of mask dst based on the most significant bit of the
- // corresponding packed double-precision (64-bit) floating-point element in a.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
- FORCE_INLINE int _mm_movemask_pd(__m128d a)
- {
-     uint64x2_t input = vreinterpretq_u64_m128d(a);
-@@ -5026,10 +4533,7 @@ FORCE_INLINE int _mm_movemask_pd(__m128d a)
- }
- 
- // Copy the lower 64-bit integer in a to dst.
--//
--//   dst[63:0] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
- FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
- {
-     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-@@ -5037,11 +4541,7 @@ FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
- 
- // Copy the 64-bit integer a to the lower element of dst, and zero the upper
- // element.
--//
--//   dst[63:0] := a[63:0]
--//   dst[127:64] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
- FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
- {
-     return vreinterpretq_m128i_s64(
-@@ -5050,9 +4550,7 @@ FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
- 
- // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
- // a and b, and store the unsigned 64-bit results in dst.
--//
--//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
--//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
- FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
- {
-     // vmull_u32 upcasts instead of masking, so we downcast.
-@@ -5063,7 +4561,7 @@ FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
- 
- // Multiply packed double-precision (64-bit) floating-point elements in a and b,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
- FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5082,7 +4580,7 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
- // Multiply the lower double-precision (64-bit) floating-point element in a and
- // b, store the result in the lower element of dst, and copy the upper element
- // from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
- FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_mul_pd(a, b));
-@@ -5090,25 +4588,17 @@ FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
- 
- // Multiply the low unsigned 32-bit integers from a and b, and store the
- // unsigned 64-bit result in dst.
--//
--//   dst[63:0] := a[31:0] * b[31:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
- FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_u64(vget_low_u64(
-         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
- }
- 
--// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
--// integers from b.
--//
--//   r0 := (a0 * b0)[31:16]
--//   r1 := (a1 * b1)[31:16]
--//   ...
--//   r7 := (a7 * b7)[31:16]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-+// Multiply the packed signed 16-bit integers in a and b, producing intermediate
-+// 32-bit integers, and store the high 16 bits of the intermediate integers in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
- FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- {
-     /* FIXME: issue with large values because of result saturation */
-@@ -5129,7 +4619,7 @@ FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
- // Multiply the packed unsigned 16-bit integers in a and b, producing
- // intermediate 32-bit integers, and store the high 16 bits of the intermediate
- // integers in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
- FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- {
-     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-@@ -5151,15 +4641,9 @@ FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
- #endif
- }
- 
--// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
--// unsigned 16-bit integers from b.
--//
--//   r0 := (a0 * b0)[15:0]
--//   r1 := (a1 * b1)[15:0]
--//   ...
--//   r7 := (a7 * b7)[15:0]
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
-+// integers, and store the low 16 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
- FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-@@ -5168,27 +4652,25 @@ FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
- 
- // Compute the bitwise OR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
- FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
- {
-     return vreinterpretq_m128d_s64(
-         vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
- 
--// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
--//
--//   r := a | b
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-+// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
- FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
--// saturates.
--// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
- FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -5196,19 +4678,9 @@ FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
- }
- 
--// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
--// and saturates.
--//
--//   r0 := SignedSaturate(a0)
--//   r1 := SignedSaturate(a1)
--//   r2 := SignedSaturate(a2)
--//   r3 := SignedSaturate(a3)
--//   r4 := SignedSaturate(b0)
--//   r5 := SignedSaturate(b1)
--//   r6 := SignedSaturate(b2)
--//   r7 := SignedSaturate(b3)
--//
--// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using signed saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
- FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-@@ -5216,19 +4688,9 @@ FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
- }
- 
--// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
--// integers and saturates.
--//
--//   r0 := UnsignedSaturate(a0)
--//   r1 := UnsignedSaturate(a1)
--//   ...
--//   r7 := UnsignedSaturate(a7)
--//   r8 := UnsignedSaturate(b0)
--//   r9 := UnsignedSaturate(b1)
--//   ...
--//   r15 := UnsignedSaturate(b7)
--//
--// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-+// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
- FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -5241,6 +4703,7 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
- // 'yield' instruction isn't a good fit because it's effectively a nop on most
- // Arm cores. Experience with several databases has shown has shown an 'isb' is
- // a reasonable approximation.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
- FORCE_INLINE void _mm_pause()
- {
-     __asm__ __volatile__("isb\n");
-@@ -5250,15 +4713,15 @@ FORCE_INLINE void _mm_pause()
- // b, then horizontally sum each consecutive 8 differences to produce two
- // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
- // 16 bits of 64-bit elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
- FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
- {
-     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
-     return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
- }
- 
--// Sets the 8 signed 16-bit integer values.
--// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-+// Set packed 16-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
- FORCE_INLINE __m128i _mm_set_epi16(short i7,
-                                    short i6,
-                                    short i5,
-@@ -5272,33 +4735,31 @@ FORCE_INLINE __m128i _mm_set_epi16(short i7,
-     return vreinterpretq_m128i_s16(vld1q_s16(data));
- }
- 
--// Sets the 4 signed 32-bit integer values.
--// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-+// Set packed 32-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
- FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
- {
-     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-     return vreinterpretq_m128i_s32(vld1q_s32(data));
- }
- 
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
- FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
- {
-     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
- }
- 
--// Returns the __m128i structure with its two 64-bit integer values
--// initialized to the values of the two 64-bit integers passed in.
--// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-+// Set packed 64-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
- FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
- {
-     return vreinterpretq_m128i_s64(
-         vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
- }
- 
--// Sets the 16 signed 8-bit integer values.
--// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
- FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
-                                   signed char b14,
-                                   signed char b13,
-@@ -5326,7 +4787,7 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
- 
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
- FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
- {
-     double ALIGN_STRUCT(16) data[2] = {e0, e1};
-@@ -5339,12 +4800,12 @@ FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
- 
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
- #define _mm_set_pd1 _mm_set1_pd
- 
- // Copy double-precision (64-bit) floating-point element a to the lower element
- // of dst, and zero the upper element.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
- FORCE_INLINE __m128d _mm_set_sd(double a)
- {
- #if defined(__aarch64__)
-@@ -5354,54 +4815,36 @@ FORCE_INLINE __m128d _mm_set_sd(double a)
- #endif
- }
- 
--// Sets the 8 signed 16-bit integer values to w.
--//
--//   r0 := w
--//   r1 := w
--//   ...
--//   r7 := w
--//
--// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-+// Broadcast 16-bit integer a to all all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
- FORCE_INLINE __m128i _mm_set1_epi16(short w)
- {
-     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
- }
- 
--// Sets the 4 signed 32-bit integer values to i.
--//
--//   r0 := i
--//   r1 := i
--//   r2 := i
--//   r3 := I
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-+// Broadcast 32-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
- FORCE_INLINE __m128i _mm_set1_epi32(int _i)
- {
-     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
- }
- 
--// Sets the 2 signed 64-bit integer values to i.
--// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
- FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
- {
-     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
- }
- 
--// Sets the 2 signed 64-bit integer values to i.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-+// Broadcast 64-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
- FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
- {
-     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
- }
- 
--// Sets the 16 signed 8-bit integer values to b.
--//
--//   r0 := b
--//   r1 := b
--//   ...
--//   r15 := b
--//
--// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-+// Broadcast 8-bit integer a to all elements of dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
- FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
- {
-     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-@@ -5409,7 +4852,7 @@ FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
- 
- // Broadcast double-precision (64-bit) floating-point value a to all elements of
- // dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
- FORCE_INLINE __m128d _mm_set1_pd(double d)
- {
- #if defined(__aarch64__)
-@@ -5419,13 +4862,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
- #endif
- }
- 
--// Sets the 8 signed 16-bit integer values in reverse order.
--//
--// Return Value
--//   r0 := w0
--//   r1 := w1
--//   ...
--//   r7 := w7
-+// Set packed 16-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
- FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-                                     short w1,
-                                     short w2,
-@@ -5439,8 +4877,8 @@ FORCE_INLINE __m128i _mm_setr_epi16(short w0,
-     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
- }
- 
--// Sets the 4 signed 32-bit integer values in reverse order
--// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-+// Set packed 32-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
- FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- {
-     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-@@ -5448,14 +4886,14 @@ FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
- }
- 
- // Set packed 64-bit integers in dst with the supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
- FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
- {
-     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
- }
- 
--// Sets the 16 signed 8-bit integer values in reverse order.
--// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-+// Set packed 8-bit integers in dst with the supplied values in reverse order.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
- FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
-                                    signed char b1,
-                                    signed char b2,
-@@ -5483,14 +4921,14 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
- 
- // Set packed double-precision (64-bit) floating-point elements in dst with the
- // supplied values in reverse order.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
- FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
- {
-     return _mm_set_pd(e0, e1);
- }
- 
- // Return vector of type __m128d with all elements set to zero.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
- FORCE_INLINE __m128d _mm_setzero_pd(void)
- {
- #if defined(__aarch64__)
-@@ -5500,15 +4938,16 @@ FORCE_INLINE __m128d _mm_setzero_pd(void)
- #endif
- }
- 
--// Sets the 128-bit value to zero
--// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-+// Return vector of type __m128i with all elements set to zero.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
- FORCE_INLINE __m128i _mm_setzero_si128(void)
- {
-     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
- }
- 
--// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
--// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-+// Shuffle 32-bit integers in a using the control in imm8, and store the results
-+// in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
- // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
- //                                        __constrange(0,255) int imm)
- #ifdef _sse2neon_shuffle
-@@ -5577,11 +5016,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
- 
- // Shuffle double-precision (64-bit) floating-point elements using the control
- // in imm8, and store the results in dst.
--//
--//   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
--//   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
- #ifdef _sse2neon_shuffle
- #define _mm_shuffle_pd(a, b, imm8)                                            \
-     vreinterpretq_m128d_s64(                                                  \
-@@ -5627,17 +5062,7 @@ FORCE_INLINE __m128i _mm_setzero_si128(void)
- 
- // Shift packed 16-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF count[63:0] > 15
--//       dst[i+15:i] := 0
--//     ELSE
--//       dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
- FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5650,17 +5075,7 @@ FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
- 
- // Shift packed 32-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF count[63:0] > 31
--//       dst[i+31:i] := 0
--//     ELSE
--//       dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
- FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5673,17 +5088,7 @@ FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
- 
- // Shift packed 64-bit integers in a left by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     IF count[63:0] > 63
--//       dst[i+63:i] := 0
--//     ELSE
--//       dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
- FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5696,17 +5101,7 @@ FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
- 
- // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF imm8[7:0] > 15
--//       dst[i+15:i] := 0
--//     ELSE
--//       dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
- FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
- {
-     if (_sse2neon_unlikely(imm & ~15))
-@@ -5717,17 +5112,7 @@ FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
- 
- // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF imm8[7:0] > 31
--//       dst[i+31:i] := 0
--//     ELSE
--//       dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
- FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
- {
-     if (_sse2neon_unlikely(imm & ~31))
-@@ -5738,17 +5123,7 @@ FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
- 
- // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     IF imm8[7:0] > 63
--//       dst[i+63:i] := 0
--//     ELSE
--//       dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
- FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
- {
-     if (_sse2neon_unlikely(imm & ~63))
-@@ -5759,14 +5134,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
- 
- // Shift a left by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--//   tmp := imm8[7:0]
--//   IF tmp > 15
--//     tmp := 16
--//   FI
--//   dst[127:0] := a[127:0] << (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
- #define _mm_slli_si128(a, imm)                                         \
-     __extension__({                                                    \
-         int8x16_t ret;                                                 \
-@@ -5782,7 +5150,7 @@ FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
- 
- // Compute the square root of packed double-precision (64-bit) floating-point
- // elements in a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
- FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -5797,7 +5165,7 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
- // Compute the square root of the lower double-precision (64-bit) floating-point
- // element in b, store the result in the lower element of dst, and copy the
- // upper element from a to the upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
- FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -5809,17 +5177,7 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
- 
- // Shift packed 16-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF count[63:0] > 15
--//       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--//     ELSE
--//       dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
--//     FI
--//  ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
- FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
- {
-     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5830,17 +5188,7 @@ FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
- 
- // Shift packed 32-bit integers in a right by count while shifting in sign bits,
- // and store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF count[63:0] > 31
--//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--//     ELSE
--//       dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
--//     FI
--//  ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
- FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
- {
-     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
-@@ -5851,17 +5199,7 @@ FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
- 
- // Shift packed 16-bit integers in a right by imm8 while shifting in sign
- // bits, and store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF imm8[7:0] > 15
--//       dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
--//     ELSE
--//       dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
- FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
- {
-     const int count = (imm & ~15) ? 15 : imm;
-@@ -5870,17 +5208,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
- 
- // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
- // and store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF imm8[7:0] > 31
--//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
--//     ELSE
--//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
- // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srai_epi32(a, imm)                                               \
-     __extension__({                                                          \
-@@ -5899,17 +5227,7 @@ FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
- 
- // Shift packed 16-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF count[63:0] > 15
--//       dst[i+15:i] := 0
--//     ELSE
--//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
- FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5922,17 +5240,7 @@ FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
- 
- // Shift packed 32-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF count[63:0] > 31
--//       dst[i+31:i] := 0
--//     ELSE
--//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
- FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5945,17 +5253,7 @@ FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
- 
- // Shift packed 64-bit integers in a right by count while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     IF count[63:0] > 63
--//       dst[i+63:i] := 0
--//     ELSE
--//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
- FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- {
-     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-@@ -5968,17 +5266,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- 
- // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     IF imm8[7:0] > 15
--//       dst[i+15:i] := 0
--//     ELSE
--//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
- #define _mm_srli_epi16(a, imm)                                               \
-     __extension__({                                                          \
-         __m128i ret;                                                         \
-@@ -5993,17 +5281,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- 
- // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     IF imm8[7:0] > 31
--//       dst[i+31:i] := 0
--//     ELSE
--//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
- // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
- #define _mm_srli_epi32(a, imm)                                               \
-     __extension__({                                                          \
-@@ -6019,17 +5297,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- 
- // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
- // store the results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     IF imm8[7:0] > 63
--//       dst[i+63:i] := 0
--//     ELSE
--//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
- #define _mm_srli_epi64(a, imm)                                               \
-     __extension__({                                                          \
-         __m128i ret;                                                         \
-@@ -6044,14 +5312,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- 
- // Shift a right by imm8 bytes while shifting in zeros, and store the results in
- // dst.
--//
--//   tmp := imm8[7:0]
--//   IF tmp > 15
--//     tmp := 16
--//   FI
--//   dst[127:0] := a[127:0] >> (tmp*8)
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
- #define _mm_srli_si128(a, imm)                                       \
-     __extension__({                                                  \
-         int8x16_t ret;                                               \
-@@ -6066,7 +5327,7 @@ FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
- // or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
- FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6079,7 +5340,7 @@ FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
- FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6095,7 +5356,7 @@ FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
- 
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory. mem_addr does not need to be aligned on any particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
- FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6105,8 +5366,9 @@ FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
- #endif
- }
- 
--// Stores four 32-bit integer values as (as a __m128i value) at the address p.
--// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-+// Store 128-bits of integer data from a into memory. mem_addr must be aligned
-+// on a 16-byte boundary or a general-protection exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
- FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- {
-     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-@@ -6115,15 +5377,12 @@ FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
- // Store the lower double-precision (64-bit) floating-point element from a into
- // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
- // boundary or a general-protection exception may be generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
- #define _mm_store1_pd _mm_store_pd1
- 
- // Store the upper double-precision (64-bit) floating-point element from a into
- // memory.
--//
--//   MEM[mem_addr+63:mem_addr] := a[127:64]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
- FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6133,8 +5392,8 @@ FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
- #endif
- }
- 
--// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
--// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-+// Store 64-bit integer from the first element of a into memory.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
- FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
- {
-     vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
-@@ -6142,10 +5401,7 @@ FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
- 
- // Store the lower double-precision (64-bit) floating-point element from a into
- // memory.
--//
--//   MEM[mem_addr+63:mem_addr] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
- FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- {
- #if defined(__aarch64__)
-@@ -6158,11 +5414,7 @@ FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
- // Store 2 double-precision (64-bit) floating-point elements from a into memory
- // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--//   MEM[mem_addr+63:mem_addr] := a[127:64]
--//   MEM[mem_addr+127:mem_addr+64] := a[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
- FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- {
-     float32x4_t f = vreinterpretq_f32_m128d(a);
-@@ -6172,21 +5424,23 @@ FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
- // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
- // elements) from a into memory. mem_addr does not need to be aligned on any
- // particular boundary.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
- FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
- {
-     _mm_store_pd(mem_addr, a);
- }
- 
--// Stores 128-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
-+// Store 128-bits of integer data from a into memory. mem_addr does not need to
-+// be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
- FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
- {
-     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
- }
- 
--// Stores 32-bits of integer data a at the address p.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
-+// Store 32-bit integer from the first element of a into memory. mem_addr does
-+// not need to be aligned on any particular boundary.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
- FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- {
-     vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
-@@ -6196,7 +5450,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
- // elements) from a into memory using a non-temporal memory hint. mem_addr must
- // be aligned on a 16-byte boundary or a general-protection exception may be
- // generated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
- FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6208,10 +5462,10 @@ FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
- #endif
- }
- 
--// Stores the data in a to the address p without polluting the caches.  If the
--// cache line containing address p is already in the cache, the cache will be
--// updated.
--// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-+// Store 128-bits of integer data from a into memory using a non-temporal memory
-+// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
-+// exception may be generated.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
- FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -6224,7 +5478,7 @@ FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
- // Store 32-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
- FORCE_INLINE void _mm_stream_si32(int *p, int a)
- {
-     vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
-@@ -6233,7 +5487,7 @@ FORCE_INLINE void _mm_stream_si32(int *p, int a)
- // Store 64-bit integer a into memory using a non-temporal hint to minimize
- // cache pollution. If the cache line containing address mem_addr is already in
- // the cache, the cache will be updated.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
- FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
- {
-     vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
-@@ -6241,32 +5495,25 @@ FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
- 
- // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
- FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
--// unsigned 32-bit integers of a.
--//
--//   r0 := a0 - b0
--//   r1 := a1 - b1
--//   r2 := a2 - b2
--//   r3 := a3 - b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-+// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
- FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
--// and store the results in dst.
--//    r0 := a0 - b0
--//    r1 := a1 - b1
-+// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
- FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s64(
-@@ -6275,7 +5522,7 @@ FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
- 
- // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
- // store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
- FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -6285,13 +5532,7 @@ FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
- // Subtract packed double-precision (64-bit) floating-point elements in b from
- // packed double-precision (64-bit) floating-point elements in a, and store the
- // results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*64
--//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
--//   ENDFOR
--//
--//  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
-+//  https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
- FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6311,71 +5552,50 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
- // the lower double-precision (64-bit) floating-point element in a, store the
- // result in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
- FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_sub_pd(a, b));
- }
- 
- // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
--//
--//   dst[63:0] := a[63:0] - b[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
- FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s64(
-         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
- }
- 
--// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
--// of a and saturates.
--//
--//   r0 := SignedSaturate(a0 - b0)
--//   r1 := SignedSaturate(a1 - b1)
--//   ...
--//   r7 := SignedSaturate(a7 - b7)
--//
--// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-+// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
- FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s16(
-         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
- }
- 
--// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
--// of a and saturates.
--//
--//   r0 := SignedSaturate(a0 - b0)
--//   r1 := SignedSaturate(a1 - b1)
--//   ...
--//   r15 := SignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-+// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
-+// using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
- FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
- }
- 
--// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
--// integers of a and saturates..
--// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-+// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
- FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
- }
- 
--// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
--// integers of a and saturates.
--//
--//   r0 := UnsignedSaturate(a0 - b0)
--//   r1 := UnsignedSaturate(a1 - b1)
--//   ...
--//   r15 := UnsignedSaturate(a15 - b15)
--//
--// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-+// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
-+// integers in a using saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
- FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u8(
-@@ -6390,7 +5610,7 @@ FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
- #define _mm_ucomineq_sd _mm_comineq_sd
- 
- // Return vector of type __m128d with undefined elements.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
- FORCE_INLINE __m128d _mm_undefined_pd(void)
- {
- #if defined(__GNUC__) || defined(__clang__)
-@@ -6404,19 +5624,9 @@ FORCE_INLINE __m128d _mm_undefined_pd(void)
- #endif
- }
- 
--// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
--// upper 4 signed or unsigned 16-bit integers in b.
--//
--//   r0 := a4
--//   r1 := b4
--//   r2 := a5
--//   r3 := b5
--//   r4 := a6
--//   r5 := b6
--//   r6 := a7
--//   r7 := b7
--//
--// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-+// Unpack and interleave 16-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
- FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6430,9 +5640,9 @@ FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
- #endif
- }
- 
--// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
--// upper 2 signed or unsigned 32-bit integers in b.
--// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
- FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6446,30 +5656,24 @@ FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
- #endif
- }
- 
--// Interleaves the upper signed or unsigned 64-bit integer in a with the
--// upper signed or unsigned 64-bit integer in b.
--//
--//   r0 := a1
--//   r1 := b1
-+// Unpack and interleave 64-bit integers from the high half of a and b, and
-+// store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
- FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+    return vreinterpretq_m128i_s64(
-+        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
-     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-+#endif
- }
- 
--// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
--// 8 signed or unsigned 8-bit integers in b.
--//
--//   r0 := a8
--//   r1 := b8
--//   r2 := a9
--//   r3 := b9
--//   ...
--//   r14 := a15
--//   r15 := b15
--//
--// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-+// Unpack and interleave 8-bit integers from the high half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
- FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6487,15 +5691,7 @@ FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
- 
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the high half of a and b, and store the results in dst.
--//
--//   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
--//     dst[63:0] := src1[127:64]
--//     dst[127:64] := src2[127:64]
--//     RETURN dst[127:0]
--//   }
--//   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
- FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6508,19 +5704,9 @@ FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
- #endif
- }
- 
--// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
--// lower 4 signed or unsigned 16-bit integers in b.
--//
--//   r0 := a0
--//   r1 := b0
--//   r2 := a1
--//   r3 := b1
--//   r4 := a2
--//   r5 := b2
--//   r6 := a3
--//   r7 := b3
--//
--// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-+// Unpack and interleave 16-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
- FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6534,15 +5720,9 @@ FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
- #endif
- }
- 
--// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
--// lower 2 signed or unsigned 32 - bit integers in b.
--//
--//   r0 := a0
--//   r1 := b0
--//   r2 := a1
--//   r3 := b1
--//
--// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-+// Unpack and interleave 32-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
- FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6556,25 +5736,24 @@ FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
- #endif
- }
- 
-+// Unpack and interleave 64-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
- FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
- {
-+#if defined(__aarch64__)
-+    return vreinterpretq_m128i_s64(
-+        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-+#else
-     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-+#endif
- }
- 
--// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
--// 8 signed or unsigned 8-bit integers in b.
--//
--//   r0 := a0
--//   r1 := b0
--//   r2 := a1
--//   r3 := b1
--//   ...
--//   r14 := a7
--//   r15 := b7
--//
--// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-+// Unpack and interleave 8-bit integers from the low half of a and b, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
- FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
- {
- #if defined(__aarch64__)
-@@ -6590,15 +5769,7 @@ FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
- 
- // Unpack and interleave double-precision (64-bit) floating-point elements from
- // the low half of a and b, and store the results in dst.
--//
--//   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
--//     dst[63:0] := src1[63:0]
--//     dst[127:64] := src2[63:0]
--//     RETURN dst[127:0]
--//   }
--//   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
- FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6613,21 +5784,16 @@ FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
- 
- // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
- // elements in a and b, and store the results in dst.
--//
--//   FOR j := 0 to 1
--//      i := j*64
--//      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
- FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
- {
-     return vreinterpretq_m128d_s64(
-         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
- }
- 
--// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
--// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-+// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
-+// and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
- FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-@@ -6639,17 +5805,7 @@ FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
- // Alternatively add and subtract packed double-precision (64-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--//
--// FOR j := 0 to 1
--//   i := j*64
--//   IF ((j & 1) == 0)
--//     dst[i+63:i] := a[i+63:i] - b[i+63:i]
--//   ELSE
--//     dst[i+63:i] := a[i+63:i] + b[i+63:i]
--//   FI
--// ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
- FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- {
-     _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
-@@ -6665,7 +5821,7 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
- // Alternatively add and subtract packed single-precision (32-bit)
- // floating-point elements in a to/from packed elements in b, and store the
- // results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
- FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
- {
-     _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
-@@ -6680,7 +5836,7 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
- 
- // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
- // elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
- FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- {
- #if defined(__aarch64__)
-@@ -6694,9 +5850,9 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
- #endif
- }
- 
--// Computes pairwise add of each argument as single-precision, floating-point
--// values a and b.
--// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-+// elements in a and b, and pack the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
- FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
- {
- #if defined(__aarch64__)
-@@ -6714,7 +5870,7 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
- 
- // Horizontally subtract adjacent pairs of double-precision (64-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
- FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
- {
- #if defined(__aarch64__)
-@@ -6732,7 +5888,7 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
- 
- // Horizontally subtract adjacent pairs of single-precision (32-bit)
- // floating-point elements in a and b, and pack the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
- FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- {
-     float32x4_t a = vreinterpretq_f32_m128(_a);
-@@ -6749,24 +5905,17 @@ FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
- // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
- // may perform better than _mm_loadu_si128 when the data crosses a cache line
- // boundary.
--//
--//   dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
- #define _mm_lddqu_si128 _mm_loadu_si128
- 
- // Load a double-precision (64-bit) floating-point element from memory into both
- // elements of dst.
--//
--//   dst[63:0] := MEM[mem_addr+63:mem_addr]
--//   dst[127:64] := MEM[mem_addr+63:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
- #define _mm_loaddup_pd _mm_load1_pd
- 
- // Duplicate the low double-precision (64-bit) floating-point element from a,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
- FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -6780,7 +5929,7 @@ FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
- 
- // Duplicate odd-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
- FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6799,7 +5948,7 @@ FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
- 
- // Duplicate even-indexed single-precision (32-bit) floating-point elements
- // from a, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
- FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
- {
- #if defined(__aarch64__)
-@@ -6820,13 +5969,7 @@ FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
- 
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*16
--//     dst[i+15:i] := ABS(a[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
- FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
- {
-     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-@@ -6834,13 +5977,7 @@ FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
- 
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*32
--//     dst[i+31:i] := ABS(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
- FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
- {
-     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-@@ -6848,13 +5985,7 @@ FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
- 
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 15
--//     i := j*8
--//     dst[i+7:i] := ABS(a[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
- FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
- {
-     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-@@ -6862,13 +5993,7 @@ FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
- 
- // Compute the absolute value of packed signed 16-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 3
--//     i := j*16
--//     dst[i+15:i] := ABS(a[i+15:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
- FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
- {
-     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-@@ -6876,13 +6001,7 @@ FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
- 
- // Compute the absolute value of packed signed 32-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 1
--//     i := j*32
--//     dst[i+31:i] := ABS(a[i+31:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
- FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
- {
-     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-@@ -6890,13 +6009,7 @@ FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
- 
- // Compute the absolute value of packed signed 8-bit integers in a, and store
- // the unsigned results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*8
--//     dst[i+7:i] := ABS(a[i+7:i])
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
- FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- {
-     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-@@ -6904,11 +6017,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- 
- // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 16 bytes in dst.
--//
--//   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
--//   dst[127:0] := tmp[127:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
- #define _mm_alignr_epi8(a, b, imm)                                            \
-     __extension__({                                                           \
-         uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
-@@ -6926,11 +6035,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
- 
- // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
- // the result right by imm8 bytes, and store the low 8 bytes in dst.
--//
--//   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
--//   dst[63:0] := tmp[63:0]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
- #define _mm_alignr_pi8(a, b, imm)                                           \
-     __extension__({                                                         \
-         __m64 ret;                                                          \
-@@ -6953,8 +6058,9 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-         ret;                                                                \
-     })
- 
--// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-+// signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
- FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- {
-     int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -6968,8 +6074,9 @@ FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
- #endif
- }
- 
--// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
--// values a and b.
-+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-+// signed 32-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
- FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
- {
-     int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -6985,7 +6092,7 @@ FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
- 
- // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
- // signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
- FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s16(
-@@ -6994,15 +6101,16 @@ FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
- 
- // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
- // signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
- FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
- {
-     return vreinterpret_m64_s32(
-         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
- }
- 
--// Computes saturated pairwise sub of each argument as a 16-bit signed
--// integer values a and b.
-+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-+// saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
- FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7025,7 +6133,7 @@ FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
- 
- // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
- // saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
- FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
- {
-     int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7040,7 +6148,7 @@ FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
- 
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
- FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
- {
-     int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7056,7 +6164,7 @@ FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
- 
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
- FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
- {
-     int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7072,7 +6180,7 @@ FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
- 
- // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
- // the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
- FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
- {
-     int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7087,7 +6195,7 @@ FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
- 
- // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
- // the signed 32-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
- FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- {
-     int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7100,9 +6208,9 @@ FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
- #endif
- }
- 
--// Computes saturated pairwise difference of each argument as a 16-bit signed
--// integer values a and b.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-+// using saturation, and pack the signed 16-bit results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
- FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
- {
-     int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7118,7 +6226,7 @@ FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
- 
- // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
- // using saturation, and pack the signed 16-bit results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
- FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- {
-     int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7135,12 +6243,7 @@ FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
- // and pack the saturated results in dst.
--//
--//   FOR j := 0 to 7
--//      i := j*16
--//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
--//      a[i+7:i]*b[i+7:i] )
--//   ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
- FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- {
- #if defined(__aarch64__)
-@@ -7179,7 +6282,7 @@ FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
- // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
- // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
- // pack the saturated results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
- FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- {
-     uint16x4_t a = vreinterpret_u16_m64(_a);
-@@ -7204,12 +6307,7 @@ FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
- // the packed 16-bit integers in dst.
--//
--//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
--//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
--//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
--//   ...
--//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
- FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- {
-     // Has issues due to saturation
-@@ -7233,7 +6331,7 @@ FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
- // Multiply packed signed 16-bit integers in a and b, producing intermediate
- // signed 32-bit integers. Truncate each intermediate integer to the 18 most
- // significant bits, round by adding 1, and store bits [16:1] to dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
- FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
- {
-     int32x4_t mul_extend =
-@@ -7245,7 +6343,7 @@ FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
- 
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
- FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
- {
-     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
-@@ -7275,18 +6373,7 @@ FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
- 
- // Shuffle packed 8-bit integers in a according to shuffle control mask in the
- // corresponding 8-bit element of b, and store the results in dst.
--//
--//   FOR j := 0 to 7
--//     i := j*8
--//     IF b[i+7] == 1
--//       dst[i+7:i] := 0
--//     ELSE
--//       index[2:0] := b[i+2:i]
--//       dst[i+7:i] := a[index*8+7:index*8]
--//     FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
- FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- {
-     const int8x8_t controlMask =
-@@ -7299,16 +6386,7 @@ FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
- // 16-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--//   for i in 0..7
--//     if b[i] < 0
--//       r[i] := -a[i]
--//     else if b[i] == 0
--//       r[i] := 0
--//     else
--//       r[i] := a[i]
--//     fi
--//   done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
- FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- {
-     int16x8_t a = vreinterpretq_s16_m128i(_a);
-@@ -7336,16 +6414,7 @@ FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
- // 32-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--//   for i in 0..3
--//     if b[i] < 0
--//       r[i] := -a[i]
--//     else if b[i] == 0
--//       r[i] := 0
--//     else
--//       r[i] := a[i]
--//     fi
--//   done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
- FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- {
-     int32x4_t a = vreinterpretq_s32_m128i(_a);
-@@ -7374,16 +6443,7 @@ FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
- // 8-bit integer in b is negative, and store the results in dst.
- // Element in dst are zeroed out when the corresponding element
- // in b is zero.
--//
--//   for i in 0..15
--//     if b[i] < 0
--//       r[i] := -a[i]
--//     else if b[i] == 0
--//       r[i] := 0
--//     else
--//       r[i] := a[i]
--//     fi
--//   done
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
- FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- {
-     int8x16_t a = vreinterpretq_s8_m128i(_a);
-@@ -7412,19 +6472,7 @@ FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
- // Negate packed 16-bit integers in a when the corresponding signed 16-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--//   FOR j := 0 to 3
--//      i := j*16
--//      IF b[i+15:i] < 0
--//        dst[i+15:i] := -(a[i+15:i])
--//      ELSE IF b[i+15:i] == 0
--//        dst[i+15:i] := 0
--//      ELSE
--//        dst[i+15:i] := a[i+15:i]
--//      FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
- FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- {
-     int16x4_t a = vreinterpret_s16_m64(_a);
-@@ -7453,19 +6501,7 @@ FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
- // Negate packed 32-bit integers in a when the corresponding signed 32-bit
- // integer in b is negative, and store the results in dst. Element in dst are
- // zeroed out when the corresponding element in b is zero.
--//
--//   FOR j := 0 to 1
--//      i := j*32
--//      IF b[i+31:i] < 0
--//        dst[i+31:i] := -(a[i+31:i])
--//      ELSE IF b[i+31:i] == 0
--//        dst[i+31:i] := 0
--//      ELSE
--//        dst[i+31:i] := a[i+31:i]
--//      FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
- FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- {
-     int32x2_t a = vreinterpret_s32_m64(_a);
-@@ -7494,19 +6530,7 @@ FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
- // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
- // in b is negative, and store the results in dst. Element in dst are zeroed out
- // when the corresponding element in b is zero.
--//
--//   FOR j := 0 to 7
--//      i := j*8
--//      IF b[i+7:i] < 0
--//        dst[i+7:i] := -(a[i+7:i])
--//      ELSE IF b[i+7:i] == 0
--//        dst[i+7:i] := 0
--//      ELSE
--//        dst[i+7:i] := a[i+7:i]
--//      FI
--//   ENDFOR
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
- FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- {
-     int8x8_t a = vreinterpret_s8_m64(_a);
-@@ -7536,15 +6560,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- 
- // Blend packed 16-bit integers from a and b using control mask imm8, and store
- // the results in dst.
--//
--//   FOR j := 0 to 7
--//       i := j*16
--//       IF imm8[j]
--//           dst[i+15:i] := b[i+15:i]
--//       ELSE
--//           dst[i+15:i] := a[i+15:i]
--//       FI
--//   ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
- // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
- //                                      __constrange(0,255) int imm)
- #define _mm_blend_epi16(a, b, imm)                                            \
-@@ -7565,7 +6581,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- 
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using control mask imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
- #define _mm_blend_pd(a, b, imm)                                \
-     __extension__({                                            \
-         const uint64_t _mask[2] = {                            \
-@@ -7579,7 +6595,7 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
- 
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
- FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
- {
-     const uint32_t ALIGN_STRUCT(16)
-@@ -7595,15 +6611,7 @@ FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
- 
- // Blend packed 8-bit integers from a and b using mask, and store the results in
- // dst.
--//
--//   FOR j := 0 to 15
--//       i := j*8
--//       IF mask[i+7]
--//           dst[i+7:i] := b[i+7:i]
--//       ELSE
--//           dst[i+7:i] := a[i+7:i]
--//       FI
--//   ENDFOR
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
- FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
- {
-     // Use a signed shift right to create a mask with the sign bit
-@@ -7616,7 +6624,7 @@ FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
- 
- // Blend packed double-precision (64-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
- FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
- {
-     uint64x2_t mask =
-@@ -7634,7 +6642,7 @@ FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
- 
- // Blend packed single-precision (32-bit) floating-point elements from a and b
- // using mask, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
- FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- {
-     // Use a signed shift right to create a mask with the sign bit
-@@ -7648,7 +6656,7 @@ FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
- // Round the packed double-precision (64-bit) floating-point elements in a up
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
- FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7662,7 +6670,7 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a up to
- // an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
- FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7677,7 +6685,7 @@ FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
- FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_ceil_pd(b));
-@@ -7687,11 +6695,7 @@ FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--//   dst[31:0] := CEIL(b[31:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
- FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_ceil_ps(b));
-@@ -7714,16 +6718,18 @@ FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
- #endif
- }
- 
--// Converts the four signed 16-bit integers in the lower 64 bits to four signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
- FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
- {
-     return vreinterpretq_m128i_s32(
-         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
- }
- 
--// Converts the two signed 16-bit integers in the lower 32 bits two signed
--// 32-bit integers.
-+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
- FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
- {
-     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-@@ -7732,16 +6738,18 @@ FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-     return vreinterpretq_m128i_s64(s64x2);
- }
- 
--// Converts the two signed 32-bit integers in the lower 64 bits to two signed
--// 64-bit integers.
-+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
- FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
- {
-     return vreinterpretq_m128i_s64(
-         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
- }
- 
--// Converts the four unsigned 8-bit integers in the lower 16 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
- FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
- {
-     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
-@@ -7749,8 +6757,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-     return vreinterpretq_m128i_s16(s16x8);
- }
- 
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
-+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
-+// the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
- FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
- {
-     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-@@ -7759,8 +6768,9 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-     return vreinterpretq_m128i_s32(s32x4);
- }
- 
--// Converts the two signed 8-bit integers in the lower 32 bits to four
--// signed 64-bit integers.
-+// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
-+// integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
- FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
- {
-     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-@@ -7770,16 +6780,18 @@ FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-     return vreinterpretq_m128i_s64(s64x2);
- }
- 
--// Converts the four unsigned 16-bit integers in the lower 64 bits to four
--// unsigned 32-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
- FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
- {
-     return vreinterpretq_m128i_u32(
-         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
- }
- 
--// Converts the two unsigned 16-bit integers in the lower 32 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
- FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
- {
-     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
-@@ -7788,8 +6800,9 @@ FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-     return vreinterpretq_m128i_u64(u64x2);
- }
- 
--// Converts the two unsigned 32-bit integers in the lower 64 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
- FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
- {
-     return vreinterpretq_m128i_u64(
-@@ -7798,7 +6811,7 @@ FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
- 
- // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
- // and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
- FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
- {
-     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
-@@ -7806,9 +6819,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-     return vreinterpretq_m128i_u16(u16x8);
- }
- 
--// Converts the four unsigned 8-bit integers in the lower 32 bits to four
--// unsigned 32-bit integers.
--// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-+// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
-+// and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
- FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
- {
-     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
-@@ -7817,8 +6830,9 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-     return vreinterpretq_m128i_u32(u32x4);
- }
- 
--// Converts the two unsigned 8-bit integers in the lower 16 bits to two
--// unsigned 64-bit integers.
-+// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
-+// 64-bit integers, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
- FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- {
-     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
-@@ -7831,7 +6845,7 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
- // Conditionally multiply the packed double-precision (64-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products, and
- // conditionally store the sum in dst using the low 4 bits of imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
- FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- {
-     // Generate mask value from constant immediate bit value
-@@ -7877,7 +6891,7 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
- // Conditionally multiply the packed single-precision (32-bit) floating-point
- // elements in a and b using the high 4 bits in imm8, sum the four products,
- // and conditionally store the sum in dst using the low 4 bits of imm.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
- FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- {
- #if defined(__aarch64__)
-@@ -7918,22 +6932,24 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-     return vreinterpretq_m128_f32(res);
- }
- 
--// Extracts the selected signed or unsigned 32-bit integer from a and zero
--// extends.
-+// Extract a 32-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
- // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
- #define _mm_extract_epi32(a, imm) \
-     vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
- 
--// Extracts the selected signed or unsigned 64-bit integer from a and zero
--// extends.
-+// Extract a 64-bit integer from a, selected with imm8, and store the result in
-+// dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
- // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
- #define _mm_extract_epi64(a, imm) \
-     vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
- 
--// Extracts the selected signed or unsigned 8-bit integer from a and zero
--// extends.
--// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
-+// Extract an 8-bit integer from a, selected with imm8, and store the result in
-+// the lower element of dst.
-+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
- #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
- 
- // Extracts the selected single-precision (32-bit) floating-point from a.
-@@ -7943,7 +6959,7 @@ FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
- // Round the packed double-precision (64-bit) floating-point elements in a down
- // to an integer value, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
- FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- {
- #if defined(__aarch64__)
-@@ -7957,7 +6973,7 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
- // Round the packed single-precision (32-bit) floating-point elements in a down
- // to an integer value, and store the results as packed single-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
- FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- {
- #if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
-@@ -7972,7 +6988,7 @@ FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
- // an integer value, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
- FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- {
-     return _mm_move_sd(a, _mm_floor_pd(b));
-@@ -7982,18 +6998,15 @@ FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
- // an integer value, store the result as a single-precision floating-point
- // element in the lower element of dst, and copy the upper 3 packed elements
- // from a to the upper elements of dst.
--//
--//   dst[31:0] := FLOOR(b[31:0])
--//   dst[127:32] := a[127:32]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
- FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- {
-     return _mm_move_ss(a, _mm_floor_ps(b));
- }
- 
--// Inserts the least significant 32 bits of b into the selected 32-bit integer
--// of a.
-+// Copy a to dst, and insert the 32-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
- // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
- //                                       __constrange(0,4) int imm)
- #define _mm_insert_epi32(a, b, imm)                                  \
-@@ -8002,8 +7015,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-             vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
-     })
- 
--// Inserts the least significant 64 bits of b into the selected 64-bit integer
--// of a.
-+// Copy a to dst, and insert the 64-bit integer i into dst at the location
-+// specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
- // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
- //                                       __constrange(0,2) int imm)
- #define _mm_insert_epi64(a, b, imm)                                  \
-@@ -8012,8 +7026,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-             vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
-     })
- 
--// Inserts the least significant 8 bits of b into the selected 8-bit integer
--// of a.
-+// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
-+// location specified by imm8.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
- // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
- //                                      __constrange(0,16) int imm)
- #define _mm_insert_epi8(a, b, imm)                                 \
-@@ -8025,7 +7040,7 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
- // Copy a to tmp, then insert a single-precision (32-bit) floating-point
- // element from b into tmp using the control in imm8. Store tmp to dst using
- // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps
- #define _mm_insert_ps(a, b, imm8)                                              \
-     __extension__({                                                            \
-         float32x4_t tmp1 =                                                     \
-@@ -8045,17 +7060,9 @@ FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
-             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
-     })
- 
--// epi versions of min/max
--// Computes the pariwise maximums of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--//   r0 := (a0 > b0) ? a0 : b0
--//   r1 := (a1 > b1) ? a1 : b1
--//   r2 := (a2 > b2) ? a2 : b2
--//   r3 := (a3 > b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed maximum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
- FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-@@ -8064,7 +7071,7 @@ FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
- 
- // Compare packed signed 8-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
- FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -8073,7 +7080,7 @@ FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
- 
- // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
- FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-@@ -8082,23 +7089,16 @@ FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
- 
- // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
- FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u32(
-         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
- }
- 
--// Computes the pariwise minima of the four signed 32-bit integer values of a
--// and b.
--//
--// A 128-bit parameter that can be defined with the following equations:
--//   r0 := (a0 < b0) ? a0 : b0
--//   r1 := (a1 < b1) ? a1 : b1
--//   r2 := (a2 < b2) ? a2 : b2
--//   r3 := (a3 < b3) ? a3 : b3
--//
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-+// Compare packed signed 32-bit integers in a and b, and store packed minimum
-+// values in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
- FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-@@ -8107,7 +7107,7 @@ FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
- 
- // Compare packed signed 8-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
- FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s8(
-@@ -8116,7 +7116,7 @@ FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
- 
- // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
- FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-@@ -8125,7 +7125,7 @@ FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
- 
- // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
- // values in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
- FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u32(
-@@ -8134,21 +7134,7 @@ FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
- 
- // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
- // in a, store the minimum and index in dst, and zero the remaining bits in dst.
--//
--//   index[2:0] := 0
--//   min[15:0] := a[15:0]
--//   FOR j := 0 to 7
--//       i := j*16
--//       IF a[i+15:i] < min[15:0]
--//           index[2:0] := j
--//           min[15:0] := a[i+15:i]
--//       FI
--//   ENDFOR
--//   dst[15:0] := min[15:0]
--//   dst[18:16] := index[2:0]
--//   dst[127:19] := 0
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
- FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- {
-     __m128i dst;
-@@ -8198,7 +7184,7 @@ FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
- // quadruplets from a. One quadruplet is selected from b starting at on the
- // offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
- // integers selected from a starting at the offset specified in imm8.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
- FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
- {
-     uint8x16_t _a, _b;
-@@ -8278,9 +7264,7 @@ FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
- 
- // Multiply the low signed 32-bit integers from each packed 64-bit element in
- // a and b, and store the signed 64-bit results in dst.
--//
--//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
--//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
- FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
- {
-     // vmull_s32 upcasts instead of masking, so we downcast.
-@@ -8289,26 +7273,18 @@ FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
- }
- 
--// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
--// unsigned 32-bit integers from b.
--// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
-+// integers, and store the low 32 bits of the intermediate integers in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
- FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_s32(
-         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
- }
- 
--// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
--// integers and saturates.
--//
--//   r0 := UnsignedSaturate(a0)
--//   r1 := UnsignedSaturate(a1)
--//   r2 := UnsignedSaturate(a2)
--//   r3 := UnsignedSaturate(a3)
--//   r4 := UnsignedSaturate(b0)
--//   r5 := UnsignedSaturate(b1)
--//   r6 := UnsignedSaturate(b2)
--//   r7 := UnsignedSaturate(b3)
-+// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
-+// using unsigned saturation, and store the results in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
- FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- {
-     return vreinterpretq_m128i_u16(
-@@ -8319,7 +7295,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
- // Round the packed double-precision (64-bit) floating-point elements in a using
- // the rounding parameter, and store the results as packed double-precision
- // floating-point elements in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
- FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
- {
- #if defined(__aarch64__)
-@@ -8448,7 +7424,7 @@ FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
- // the rounding parameter, store the result as a double-precision floating-point
- // element in the lower element of dst, and copy the upper element from a to the
- // upper element of dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
- FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- {
-     return _mm_move_sd(a, _mm_round_pd(b, rounding));
-@@ -8468,7 +7444,7 @@ FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
- //     (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress
- //     exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
- //     _MM_SET_ROUNDING_MODE
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
- FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- {
-     return _mm_move_ss(a, _mm_round_ps(b, rounding));
-@@ -8477,10 +7453,7 @@ FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
- // Load 128-bits of integer data from memory into dst using a non-temporal
- // memory hint. mem_addr must be aligned on a 16-byte boundary or a
- // general-protection exception may be generated.
--//
--//   dst[127:0] := MEM[mem_addr+127:mem_addr]
--//
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
- FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
- {
- #if __has_builtin(__builtin_nontemporal_store)
-@@ -8492,7 +7465,7 @@ FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
- 
- // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
- // all 1's, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
- FORCE_INLINE int _mm_test_all_ones(__m128i a)
- {
-     return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
-@@ -8501,7 +7474,7 @@ FORCE_INLINE int _mm_test_all_ones(__m128i a)
- 
- // Compute the bitwise AND of 128 bits (representing integer data) in a and
- // mask, and return 1 if the result is zero, otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
- FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- {
-     int64x2_t a_and_mask =
-@@ -8514,7 +7487,7 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
- // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
- // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros
- FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- {
-     uint64x2_t zf =
-@@ -8529,7 +7502,7 @@ FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the CF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
- FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- {
-     int64x2_t s64 =
-@@ -8542,14 +7515,14 @@ FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
- // otherwise return 0.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
- #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
- 
- // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
- // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
- // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
- // otherwise set CF to 0. Return the ZF value.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
- FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
- {
-     int64x2_t s64 =
-@@ -9028,7 +8001,7 @@ FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
- FORCE_INLINE int _sse2neon_clz(unsigned int x)
- {
- #if _MSC_VER
--    DWORD cnt = 0;
-+    unsigned long cnt = 0;
-     if (_BitScanForward(&cnt, x))
-         return cnt;
-     return 32;
-@@ -9040,7 +8013,7 @@ FORCE_INLINE int _sse2neon_clz(unsigned int x)
- FORCE_INLINE int _sse2neon_ctz(unsigned int x)
- {
- #if _MSC_VER
--    DWORD cnt = 0;
-+    unsigned long cnt = 0;
-     if (_BitScanReverse(&cnt, x))
-         return 31 - cnt;
-     return 32;
-@@ -9053,18 +8026,16 @@ FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
- {
- #if _MSC_VER
-     unsigned long cnt;
--#ifdef defined(SSE2NEON_HAS_BITSCAN64)
--    (defined(_M_AMD64) || defined(__x86_64__))
--        if((_BitScanForward64(&cnt, x))
--            return (int)(cnt);
-+#if defined(SSE2NEON_HAS_BITSCAN64)
-+    if ((_BitScanForward64(&cnt, x))
-+        return (int)(cnt);
- #else
-     if (_BitScanForward(&cnt, (unsigned long) (x)))
-         return (int) cnt;
-     if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
-         return (int) (cnt + 32);
--#endif
--    return 64;
--#else
-+#endif /* SSE2NEON_HAS_BITSCAN64 */
-+#else  /* assume GNU compatible compilers */
-     return x != 0 ? __builtin_ctzll(x) : 64;
- #endif
- }
-@@ -9155,7 +8126,7 @@ FORCE_INLINE int _mm_cmpestrc(__m128i a,
- 
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated index in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
- FORCE_INLINE int _mm_cmpestri(__m128i a,
-                               int la,
-                               __m128i b,
-@@ -9168,7 +8139,7 @@ FORCE_INLINE int _mm_cmpestri(__m128i a,
- 
- // Compare packed strings in a and b with lengths la and lb using the control
- // in imm8, and store the generated mask in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
- FORCE_INLINE __m128i
- _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
- {
-@@ -9324,8 +8295,8 @@ FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
- }
- 
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 16-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-+// unsigned 16-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
- FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9342,8 +8313,8 @@ FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
- }
- 
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 32-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-+// unsigned 32-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
- FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9360,8 +8331,8 @@ FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
- }
- 
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 64-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-+// unsigned 64-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
- FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9376,8 +8347,8 @@ FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
- }
- 
- // Starting with the initial value in crc, accumulates a CRC32 value for
--// unsigned 8-bit integer v.
--// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-+// unsigned 8-bit integer v, and stores the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
- FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
- {
- #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-@@ -9486,43 +8457,61 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
- 
- /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
- #define SSE2NEON_AES_H0(x) (x)
--static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
--static const uint8_t SSE2NEON_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
-+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
- #undef SSE2NEON_AES_H0
- 
--// In the absence of crypto extensions, implement aesenc using regular neon
-+/* x_time function and matrix multiply function */
-+#if !defined(__aarch64__)
-+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
-+#define SSE2NEON_MULTIPLY(x, y)                                  \
-+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
-+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
-+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
-+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
-+#endif
-+
-+// In the absence of crypto extensions, implement aesenc using regular NEON
- // intrinsics instead. See:
- // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
- // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
--// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
--// for more information Reproduced with permission of the author.
-+// for more information.
- FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
--    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
--                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
--                                         0xc, 0x1, 0x6, 0xb};
--    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
--                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+    static const uint8_t shift_rows[] = {
-+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
-+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
-+    };
-+    static const uint8_t ror32by8[] = {
-+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+    };
- 
-     uint8x16_t v;
-     uint8x16_t w = vreinterpretq_u8_m128i(a);
- 
--    // shift rows
-+    /* shift rows */
-     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
- 
--    // sub bytes
--    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+    /* sub bytes */
-+    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
-+    // look up each of the table. After each lookup, we load the next table
-+    // which locates at the next 64-bytes. In the meantime, the index in the
-+    // table would be smaller than it was, so the index parameters of
-+    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
-+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
- 
--    // mix columns
-+    /* mix columns */
-     w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
- 
--    //  add round key
-+    /* add round key */
-     return vreinterpretq_m128i_u8(w) ^ RoundKey;
- 
- #else /* ARMv7-A implementation for a table-based AES */
-@@ -9587,31 +8576,34 @@ FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
- FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
--    static const uint8_t inv_shift_rows[] = {0x0, 0xd, 0xa, 0x7, 0x4, 0x1,
--                                             0xe, 0xb, 0x8, 0x5, 0x2, 0xf,
--                                             0xc, 0x9, 0x6, 0x3};
--    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
--                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-+    static const uint8_t inv_shift_rows[] = {
-+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+    };
-+    static const uint8_t ror32by8[] = {
-+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+    };
- 
-     uint8x16_t v;
-     uint8x16_t w = vreinterpretq_u8_m128i(a);
- 
--    // shift rows
-+    // inverse shift rows
-     w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
- 
--    // sub bytes
--    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_rsbox), w);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x40), w - 0x40);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0x80), w - 0x80);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_rsbox + 0xc0), w - 0xc0);
-+    // inverse sub bytes
-+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
- 
-+    // inverse mix columns
-     // muliplying 'v' by 4 in GF(2^8)
-     w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-     w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-     v ^= w;
-     v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
- 
--    // mix columns
-     w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
-                                  0x1b);  // muliplying 'v' by 2 in GF(2^8)
-     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-@@ -9621,35 +8613,29 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-     return vreinterpretq_m128i_u8(w) ^ RoundKey;
- 
- #else /* ARMv7-A NEON implementation */
--/* FIXME: optimized for NEON */
--#define XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
--#define MULTIPLY(x, y)                                                     \
--    (((y & 1) * x) ^ ((y >> 1 & 1) * XT(x)) ^ ((y >> 2 & 1) * XT(XT(x))) ^ \
--     ((y >> 3 & 1) * XT(XT(XT(x)))) ^ ((y >> 4 & 1) * XT(XT(XT(XT(x))))))
--
-+    /* FIXME: optimized for NEON */
-     uint8_t i, e, f, g, h, v[4][4];
-     uint8_t *_a = (uint8_t *) &a;
-     for (i = 0; i < 16; ++i) {
--        v[((i / 4) + (i % 4)) % 4][i % 4] = SSE2NEON_rsbox[_a[i]];
-+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-     }
- 
-+    // inverse mix columns
-     for (i = 0; i < 4; ++i) {
-         e = v[i][0];
-         f = v[i][1];
-         g = v[i][2];
-         h = v[i][3];
- 
--        v[i][0] = MULTIPLY(e, 0x0e) ^ MULTIPLY(f, 0x0b) ^ MULTIPLY(g, 0x0d) ^
--                  MULTIPLY(h, 0x09);
--        v[i][1] = MULTIPLY(e, 0x09) ^ MULTIPLY(f, 0x0e) ^ MULTIPLY(g, 0x0b) ^
--                  MULTIPLY(h, 0x0d);
--        v[i][2] = MULTIPLY(e, 0x0d) ^ MULTIPLY(f, 0x09) ^ MULTIPLY(g, 0x0e) ^
--                  MULTIPLY(h, 0x0b);
--        v[i][3] = MULTIPLY(e, 0x0b) ^ MULTIPLY(f, 0x0d) ^ MULTIPLY(g, 0x09) ^
--                  MULTIPLY(h, 0x0e);
-+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-     }
--#undef XT
--#undef MULTIPLY
- 
-     return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
- #endif
-@@ -9657,7 +8643,7 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
- 
- // Perform the last round of an AES encryption flow on data (state) in a using
- // the round key in RoundKey, and store the result in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
- #if defined(__aarch64__)
-@@ -9673,59 +8659,166 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
- 
-     // sub bytes
--    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
--    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
--    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
- 
--    //  add round key
-+    // add round key
-     return vreinterpretq_m128i_u8(v) ^ RoundKey;
- 
- #else /* ARMv7-A implementation */
-     uint8_t v[16] = {
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
--        SSE2NEON_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
-+        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
-     };
- 
-     return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
- #endif
- }
- 
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+#if defined(__aarch64__)
-+    static const uint8_t inv_shift_rows[] = {
-+        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
-+        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
-+    };
-+
-+    uint8x16_t v;
-+    uint8x16_t w = vreinterpretq_u8_m128i(a);
-+
-+    // inverse shift rows
-+    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
-+
-+    // inverse sub bytes
-+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
-+
-+    // add round key
-+    return vreinterpretq_m128i_u8(v) ^ RoundKey;
-+
-+#else /* ARMv7-A NEON implementation */
-+    /* FIXME: optimized for NEON */
-+    uint8_t v[4][4];
-+    uint8_t *_a = (uint8_t *) &a;
-+    for (int i = 0; i < 16; ++i) {
-+        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
-+    }
-+
-+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
-+#endif
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+#if defined(__aarch64__)
-+    static const uint8_t ror32by8[] = {
-+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
-+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
-+    };
-+    uint8x16_t v = vreinterpretq_u8_m128i(a);
-+    uint8x16_t w;
-+
-+    // multiplying 'v' by 4 in GF(2^8)
-+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
-+    v ^= w;
-+    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
-+
-+    // multiplying 'v' by 2 in GF(2^8)
-+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
-+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
-+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-+    return vreinterpretq_m128i_u8(w);
-+
-+#else /* ARMv7-A NEON implementation */
-+    uint8_t i, e, f, g, h, v[4][4];
-+    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
-+    for (i = 0; i < 4; ++i) {
-+        e = v[i][0];
-+        f = v[i][1];
-+        g = v[i][2];
-+        h = v[i][3];
-+
-+        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
-+        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
-+        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
-+                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
-+        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
-+                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
-+    }
-+
-+    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
-+#endif
-+}
-+
-+// Assist in expanding the AES cipher key by computing steps towards generating
-+// a round key for encryption cipher using data from a and an 8-bit round
-+// constant specified in imm8, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
-+//
- // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
- // This instruction generates a round key for AES encryption. See
- // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
- // for details.
--//
--// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
--FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
-+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
- {
--    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
--    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
-+#if defined(__aarch64__)
-+    uint8x16_t _a = vreinterpretq_u8_m128i(a);
-+    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
-+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
-+
-+    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
-+    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
-+    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
-+
-+    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
-+
-+#else /* ARMv7-A NEON implementation */
-+    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
-+    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
-     for (int i = 0; i < 4; ++i) {
--        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
--        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
-+        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
-+        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
-     }
-     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
-                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-+#endif
- }
- #undef SSE2NEON_AES_SBOX
- #undef SSE2NEON_AES_RSBOX
- 
-+#if defined(__aarch64__)
-+#undef SSE2NEON_XT
-+#undef SSE2NEON_MULTIPLY
-+#endif
-+
- #else /* __ARM_FEATURE_CRYPTO */
- // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
- // AESMC and then manually applying the real key as an xor operation. This
-@@ -9750,7 +8843,9 @@ FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
-         vreinterpretq_u8_m128i(RoundKey)));
- }
- 
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-+// Perform the last round of an AES encryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
- FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
- {
-     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
-@@ -9758,6 +8853,23 @@ FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-                          RoundKey);
- }
- 
-+// Perform the last round of an AES decryption flow on data (state) in a using
-+// the round key in RoundKey, and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
-+FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
-+{
-+    return vreinterpretq_m128i_u8(
-+        vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^
-+        vreinterpretq_u8_m128i(RoundKey));
-+}
-+
-+// Perform the InvMixColumns transformation on a and store the result in dst.
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
-+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
-+{
-+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
-+}
-+
- // Assist in expanding the AES cipher key by computing steps towards generating
- // a round key for encryption cipher using data from a and an 8-bit round
- // constant specified in imm8, and store the result in dst."
-@@ -9783,7 +8895,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
- 
- // Perform a carry-less multiplication of two 64-bit integers, selected from a
- // and b according to imm8, and store the results in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
- FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
- {
-     uint64x2_t a = vreinterpretq_u64_m128i(_a);
-@@ -9828,7 +8940,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
- 
- // Count the number of bits set to 1 in unsigned 32-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
- FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
- {
- #if defined(__aarch64__)
-@@ -9855,7 +8967,7 @@ FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
- 
- // Count the number of bits set to 1 in unsigned 64-bit integer a, and
- // return that count in dst.
--// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
- FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
- {
- #if defined(__aarch64__)
-@@ -9911,7 +9023,6 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
- 
- // Return the current 64-bit value of the processor's time-stamp counter.
- // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
--
- FORCE_INLINE uint64_t _rdtsc(void)
- {
- #if defined(__aarch64__)
diff --git a/bazel/patches/emp-tool.patch b/bazel/patches/emp-tool.patch
deleted file mode 100644
index 48220f890..000000000
--- a/bazel/patches/emp-tool.patch
+++ /dev/null
@@ -1,163 +0,0 @@
-diff --git a/emp-tool/utils/aes.h b/emp-tool/utils/aes.h
-index 0235544..75a8486 100644
---- a/emp-tool/utils/aes.h
-+++ b/emp-tool/utils/aes.h
-@@ -54,6 +54,10 @@
- 
- #include "emp-tool/utils/block.h"
- 
-+#ifdef __aarch64__
-+#include "emp-tool/utils/sse2neon.h"
-+#endif
-+
- namespace emp {
- 
- typedef struct { block rd_key[11]; unsigned int rounds; } AES_KEY;
-@@ -103,6 +107,7 @@ AES_set_encrypt_key(const block userkey, AES_KEY *key) {
- 
- #ifdef __x86_64__
- __attribute__((target("aes,sse2")))
-+#endif
- inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY *key) {
-    for (unsigned int i = 0; i < nblks; ++i)
-       blks[i] = _mm_xor_si128(blks[i], key->rd_key[0]);
-@@ -112,22 +117,6 @@ inline void AES_ecb_encrypt_blks(block *blks, unsigned int nblks, const AES_KEY
-    for (unsigned int i = 0; i < nblks; ++i)
-       blks[i] = _mm_aesenclast_si128(blks[i], key->rd_key[key->rounds]);
- }
--#elif __aarch64__
--inline void AES_ecb_encrypt_blks(block *_blks, unsigned int nblks, const AES_KEY *key) {
--   uint8x16_t * blks = (uint8x16_t*)(_blks);
--   uint8x16_t * keys = (uint8x16_t*)(key->rd_key);
--   auto * first = blks;
--   for (unsigned int j = 0; j < key->rounds-1; ++j) {
--		uint8x16_t key_j = (uint8x16_t)keys[j];
--      blks = first;
--      for (unsigned int i = 0; i < nblks; ++i, ++blks)
--	       *blks = vaesmcq_u8(vaeseq_u8(*blks, key_j));
--   }
--	uint8x16_t last_key = (uint8x16_t)keys[key->rounds-1];
--	for (unsigned int i = 0; i < nblks; ++i, ++first)
--		 *first = vaeseq_u8(*first, last_key) ^ (uint8x16_t)keys[key->rounds];
--}
--#endif
- 
- #ifdef __GNUC__
- 	#ifndef __clang__
-diff --git a/emp-tool/utils/aes_opt.h b/emp-tool/utils/aes_opt.h
-index 2594e32..6a78b75 100644
---- a/emp-tool/utils/aes_opt.h
-+++ b/emp-tool/utils/aes_opt.h
-@@ -58,7 +58,6 @@ static inline void AES_opt_key_schedule(block* user_key, AES_KEY *keys) {
- /*
-  * With numKeys keys, use each key to encrypt numEncs blocks.
-  */
--#ifdef __x86_64__
- template<int numKeys, int numEncs>
- static inline void ParaEnc(block *blks, AES_KEY *keys) {
- 	block * first = blks;
-@@ -90,29 +89,6 @@ static inline void ParaEnc(block *blks, AES_KEY *keys) {
- 		}
- 	}
- }
--#elif __aarch64__
--template<int numKeys, int numEncs>
--static inline void ParaEnc(block *_blks, AES_KEY *keys) {
--	uint8x16_t * first = (uint8x16_t*)(_blks);
--
--	for (unsigned int r = 0; r < 9; ++r) { 
--		auto blks = first;
--		for(size_t i = 0; i < numKeys; ++i) {
--			uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[r]);
--			for(size_t j = 0; j < numEncs; ++j, ++blks)
--			   *blks = vaesmcq_u8(vaeseq_u8(*blks, K));
--		}
--	}
--	
--	auto blks = first;
--	for(size_t i = 0; i < numKeys; ++i) {
--		uint8x16_t K = vreinterpretq_u8_m128i(keys[i].rd_key[9]);
--		uint8x16_t K2 = vreinterpretq_u8_m128i(keys[i].rd_key[10]);
--		for(size_t j = 0; j < numEncs; ++j, ++blks)
--			*blks = vaeseq_u8(*blks, K) ^ K2;
--	}
--}
--#endif
- 
- }
- #endif
-diff --git a/emp-tool/utils/block.h b/emp-tool/utils/block.h
-index f7d3d34..fcc21c1 100644
---- a/emp-tool/utils/block.h
-+++ b/emp-tool/utils/block.h
-@@ -5,16 +5,7 @@
- #include <immintrin.h>
- #elif __aarch64__
- #include "sse2neon.h"
--inline __m128i _mm_aesimc_si128(__m128i a) {
--	return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
--}
--
--inline __m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
--{
--    return vreinterpretq_m128i_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)) ^ vreinterpretq_u8_m128i(RoundKey));
--}
- #endif
--
- #include <assert.h>
- #include <cstring>
- #include <iostream>
-diff --git a/emp-tool/utils/f2k.h b/emp-tool/utils/f2k.h
-index 7fe1b1b..f6186a1 100644
---- a/emp-tool/utils/f2k.h
-+++ b/emp-tool/utils/f2k.h
-@@ -6,6 +6,7 @@ namespace emp {
- 	/* multiplication in galois field without reduction */
- 	#ifdef __x86_64__
- 	__attribute__((target("sse2,pclmul")))
-+	#endif
- 	inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
- 		__m128i tmp3, tmp4, tmp5, tmp6;
- 		tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
-@@ -22,28 +23,6 @@ namespace emp {
- 		*res1 = tmp3;
- 		*res2 = tmp6;
- 	}
--	#elif __aarch64__
--	inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
--		__m128i tmp3, tmp4, tmp5, tmp6;
--		poly64_t a_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(a));
--		poly64_t a_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(a));
--		poly64_t b_lo = (poly64_t)vget_low_u64(vreinterpretq_u64_m128i(b));
--		poly64_t b_hi = (poly64_t)vget_high_u64(vreinterpretq_u64_m128i(b));
--		tmp3 = (__m128i)vmull_p64(a_lo, b_lo);
--		tmp4 = (__m128i)vmull_p64(a_hi, b_lo);
--		tmp5 = (__m128i)vmull_p64(a_lo, b_hi);
--		tmp6 = (__m128i)vmull_p64(a_hi, b_hi);
--
--		tmp4 = _mm_xor_si128(tmp4, tmp5);
--		tmp5 = _mm_slli_si128(tmp4, 8);
--		tmp4 = _mm_srli_si128(tmp4, 8);
--		tmp3 = _mm_xor_si128(tmp3, tmp5);
--		tmp6 = _mm_xor_si128(tmp6, tmp4);
--		// initial mul now in tmp3, tmp6
--		*res1 = tmp3;
--		*res2 = tmp6;
--	}
--	#endif
- 
- 	/* multiplication in galois field with reduction */
- 	#ifdef __x86_64__
-diff --git a/emp-tool/utils/prg.h b/emp-tool/utils/prg.h
-index 23bbf42..5101d7e 100644
---- a/emp-tool/utils/prg.h
-+++ b/emp-tool/utils/prg.h
-@@ -82,7 +82,7 @@ class PRG { public:
- 		} else {
- 			block tmp[2];
- 			random_block(tmp, 2);
--			memcpy(data, tmp, nbytes);
-+			memcpy(data, tmp, nbytes <= 32? nbytes : 32);
- 		}
- 	}
- 
diff --git a/bazel/patches/grpc-1.66.patch b/bazel/patches/grpc-1.66.patch
new file mode 100644
index 000000000..b6f82e587
--- /dev/null
+++ b/bazel/patches/grpc-1.66.patch
@@ -0,0 +1,20 @@
+diff --git a/third_party/BUILD b/third_party/BUILD
+index 77cb52d0fc..c4b647f5c9 100644
+--- a/third_party/BUILD
++++ b/third_party/BUILD
+@@ -18,13 +18,13 @@ package(default_visibility = ["//:__subpackages__"])
+ 
+ alias(
+     name = "libssl",
+-    actual = "@boringssl//:ssl",
++    actual = "@openssl//:ssl",
+     tags = ["manual"],
+ )
+ 
+ alias(
+     name = "libcrypto",
+-    actual = "@boringssl//:crypto",
++    actual = "@openssl//:crypto",
+     tags = ["manual"],
+ )
+ 
diff --git a/bazel/patches/grpc-module-file.patch b/bazel/patches/grpc-module-file.patch
new file mode 100644
index 000000000..29dc393c1
--- /dev/null
+++ b/bazel/patches/grpc-module-file.patch
@@ -0,0 +1,13 @@
+diff --git a/MODULE.bazel b/MODULE.bazel
+index 4a8fbe83..8650f678 100644
+--- a/MODULE.bazel
++++ b/MODULE.bazel
+@@ -8,7 +8,7 @@ module(
+ bazel_dep(name = "abseil-cpp", version = "20240116.0", repo_name = "com_google_absl")
+ bazel_dep(name = "apple_support", version = "1.15.1", repo_name = "build_bazel_apple_support")
+ bazel_dep(name = "bazel_skylib", version = "1.5.0")
+-bazel_dep(name = "boringssl", version = "0.0.0-20230215-5c22014")
++bazel_dep(name = "openssl", version = "3.3.2")
+ bazel_dep(name = "c-ares", version = "1.15.0", repo_name = "com_github_cares_cares")
+ bazel_dep(name = "gazelle", version = "0.36.0", repo_name = "bazel_gazelle")
+ bazel_dep(name = "google_benchmark", version = "1.8.4", repo_name = "com_github_google_benchmark")
\ No newline at end of file
diff --git a/bazel/patches/hexl.patch b/bazel/patches/hexl.patch
deleted file mode 100644
index 4d498ca8f..000000000
--- a/bazel/patches/hexl.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index b0da96f..61bfdd8 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -113,10 +113,10 @@ message(STATUS "CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
- #------------------------------------------------------------------------------
- # Set AVX flags
- #------------------------------------------------------------------------------
--hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512dq.cpp" HEXL_HAS_AVX512DQ)
--hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512ifma.cpp" HEXL_HAS_AVX512IFMA)
--hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512vbmi2.cpp" HEXL_HAS_AVX512VBMI2)
--hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx256.cpp" HEXL_HAS_AVX256)
-+# hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512dq.cpp" HEXL_HAS_AVX512DQ)
-+# hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512ifma.cpp" HEXL_HAS_AVX512IFMA)
-+# hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx512vbmi2.cpp" HEXL_HAS_AVX512VBMI2)
-+# hexl_check_compile_flag("${HEXL_CMAKE_PATH}/test-avx256.cpp" HEXL_HAS_AVX256)
-
- # ------------------------------------------------------------------------------
- # Installation logic...
-diff --git a/hexl/CMakeLists.txt b/hexl/CMakeLists.txt
-index 7c660a0..7e2e1c9 100644
---- a/hexl/CMakeLists.txt
-+++ b/hexl/CMakeLists.txt
-@@ -93,7 +93,7 @@ endif()
-
- if (CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
-     target_compile_options(hexl PRIVATE -Wall -Wconversion -Wshadow -pedantic -Wextra
--        -Wno-unknown-pragmas -march=native -O3 -fomit-frame-pointer
-+        -Wno-unknown-pragmas -mavx -O3 -fomit-frame-pointer
-         -Wno-sign-conversion
-         -Wno-implicit-int-conversion
-     )
diff --git a/bazel/patches/protobuf-xla.patch b/bazel/patches/protobuf-xla.patch
new file mode 100644
index 000000000..b09b9c49a
--- /dev/null
+++ b/bazel/patches/protobuf-xla.patch
@@ -0,0 +1,347 @@
+diff --git a/BUILD.bazel b/BUILD.bazel
+index 301a04656..b4d953fd2 100644
+--- a/BUILD.bazel
++++ b/BUILD.bazel
+@@ -8,7 +8,7 @@ load("//bazel:java_proto_library.bzl", "java_proto_library")
+ load("//bazel:proto_library.bzl", "proto_library")
+ load("//bazel/toolchains:proto_lang_toolchain.bzl", "proto_lang_toolchain")
+ load("//build_defs:cpp_opts.bzl", "COPTS", "LINK_OPTS")
+-load(":protobuf.bzl", "internal_objc_proto_library", "internal_php_proto_library", "internal_py_proto_library")
++load(":protobuf.bzl", "adapt_proto_library", "internal_objc_proto_library", "internal_php_proto_library", "internal_py_proto_library")
+ 
+ licenses(["notice"])
+ 
+@@ -192,6 +192,25 @@ cc_library(
+     visibility = ["//visibility:public"],
+ )
+ 
++adapt_proto_library(
++    name = "cc_wkt_protos_genproto",
++    visibility = ["//visibility:public"],
++    deps = [
++        "//:any_proto",
++        "//:api_proto",
++        "//:compiler_plugin_proto",
++        "//:descriptor_proto",
++        "//:duration_proto",
++        "//:empty_proto",
++        "//:field_mask_proto",
++        "//:source_context_proto",
++        "//:struct_proto",
++        "//:timestamp_proto",
++        "//:type_proto",
++        "//:wrappers_proto",
++    ],
++)
++
+ # Source protos that are typically part of the protobuf runtime.
+ #
+ # DEPRECATED: Prefer :well_known_type_protos for the Well-Known Types
+diff --git a/protobuf.bzl b/protobuf.bzl
+index 7db5146a0..a0e05d4d2 100644
+--- a/protobuf.bzl
++++ b/protobuf.bzl
+@@ -88,17 +88,17 @@ def _proto_gen_impl(ctx):
+     if source_dir:
+         has_sources = any([src.is_source for src in srcs])
+         if has_sources:
+-            import_flags += ["-I" + source_dir]
++            import_flags.append("-I" + source_dir)
+     else:
+-        import_flags += ["-I."]
++        import_flags.append("-I.")
+ 
+     has_generated = any([not src.is_source for src in srcs])
+     if has_generated:
+-        import_flags += ["-I" + gen_dir]
++        import_flags.append("-I" + gen_dir)
+ 
+     if ctx.attr.includes:
+         for include in ctx.attr.includes:
+-            import_flags += ["-I" + _GetPath(ctx, include)]
++            import_flags.append("-I" + _GetPath(ctx, include))
+ 
+     import_flags = depset(direct = import_flags)
+ 
+@@ -153,7 +153,7 @@ def _proto_gen_impl(ctx):
+                 outs.extend(_RubyOuts([src.basename]))
+ 
+             # Otherwise, rely on user-supplied outs.
+-            args += [("--%s_out=" + path_tpl) % (lang, gen_dir)]
++            args.append(("--%s_out=" + path_tpl) % (lang, gen_dir))
+ 
+         if ctx.attr.outs:
+             outs.extend(ctx.attr.outs)
+@@ -174,8 +174,8 @@ def _proto_gen_impl(ctx):
+ 
+             if ctx.attr.plugin_options:
+                 outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir
+-            args += [("--plugin=protoc-gen-%s=" + path_tpl) % (lang, plugin.path)]
+-            args += ["--%s_out=%s" % (lang, outdir)]
++            args.append(("--plugin=protoc-gen-%s=" + path_tpl) % (lang, plugin.path))
++            args.append("--%s_out=%s" % (lang, outdir))
+             tools.append(plugin)
+ 
+         if not in_gen_dir:
+@@ -765,3 +765,261 @@ def check_protobuf_required_bazel_version():
+     copied filegroup. (Fixed in bazel 0.5.4)
+     """
+     versions.check(minimum_bazel_version = "0.5.4")
++
++def _CcHdrs(srcs, use_grpc_plugin = False):
++    ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
++    if use_grpc_plugin:
++        ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
++    return ret
++
++def _CcSrcs(srcs, use_grpc_plugin = False):
++    ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
++    if use_grpc_plugin:
++        ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
++    return ret
++
++def __proto_gen_impl(ctx):
++    """General implementation for generating protos"""
++    srcs = ctx.files.srcs
++    deps = []
++    deps += ctx.files.srcs
++    source_dir = _SourceDir(ctx)
++    gen_dir = _GenDir(ctx)
++    if source_dir:
++        import_flags = ["-I" + source_dir, "-I" + gen_dir]
++    else:
++        import_flags = ["-I."]
++
++    for dep in ctx.attr.deps:
++        import_flags += dep.proto.import_flags
++        deps += dep.proto.deps
++    import_flags = depset(import_flags).to_list()
++    deps = depset(deps).to_list()
++
++    args = []
++    if ctx.attr.gen_cc:
++        args.append("--cpp_out=" + gen_dir)
++    if ctx.attr.gen_py:
++        args.append("--python_out=" + gen_dir)
++
++    inputs = srcs + deps
++    tools = [ctx.executable.protoc]
++    if ctx.executable.plugin:
++        plugin = ctx.executable.plugin
++        lang = ctx.attr.plugin_language
++        if not lang and plugin.basename.startswith("protoc-gen-"):
++            lang = plugin.basename[len("protoc-gen-"):]
++        if not lang:
++            fail("cannot infer the target language of plugin", "plugin_language")
++
++        outdir = gen_dir
++        if ctx.attr.plugin_options:
++            outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir
++        args.append("--plugin=protoc-gen-%s=%s" % (lang, plugin.path))
++        args.append("--%s_out=%s" % (lang, outdir))
++        tools.append(plugin)
++
++    if args:
++        ctx.actions.run(
++            inputs = inputs,
++            outputs = ctx.outputs.outs,
++            arguments = args + import_flags + [s.path for s in srcs],
++            executable = ctx.executable.protoc,
++            mnemonic = "ProtoCompile",
++            tools = tools,
++            use_default_shell_env = True,
++        )
++
++    return struct(
++        proto = struct(
++            srcs = srcs,
++            import_flags = import_flags,
++            deps = deps,
++        ),
++    )
++
++proto_gen = rule(
++    attrs = {
++        "srcs": attr.label_list(allow_files = True),
++        "deps": attr.label_list(providers = ["proto"]),
++        "includes": attr.string_list(),
++        "protoc": attr.label(
++            cfg = "exec",
++            executable = True,
++            allow_single_file = True,
++            mandatory = True,
++        ),
++        "plugin": attr.label(
++            cfg = "exec",
++            allow_files = True,
++            executable = True,
++        ),
++        "plugin_language": attr.string(),
++        "plugin_options": attr.string_list(),
++        "gen_cc": attr.bool(),
++        "gen_py": attr.bool(),
++        "outs": attr.output_list(),
++    },
++    implementation = __proto_gen_impl,
++)
++
++"""Generates code from Protocol Buffers definitions.
++
++This rule helps you to implement Starlark macros specific to the target
++language. You should prefer more specific `cc_proto_library`,
++`py_proto_library` and others unless you are adding such wrapper macros.
++
++Args:
++  srcs: Protocol Buffers definition files (.proto) to run the protocol compiler
++    against.
++  deps: a list of dependency labels; must be other proto libraries.
++  includes: a list of include paths to .proto files.
++  protoc: the label of the protocol compiler to generate the sources.
++  plugin: the label of the protocol compiler plugin to be passed to the protocol
++    compiler.
++  plugin_language: the language of the generated sources
++  plugin_options: a list of options to be passed to the plugin
++  gen_cc: generates C++ sources in addition to the ones from the plugin.
++  gen_py: generates Python sources in addition to the ones from the plugin.
++  outs: a list of labels of the expected outputs from the protocol compiler.
++"""
++
++def cc_proto_library(
++        name,
++        srcs = [],
++        deps = [],
++        cc_libs = [],
++        include = None,
++        protoc = "@com_google_protobuf//:protoc",
++        internal_bootstrap_hack = False,
++        use_grpc_plugin = False,
++        default_runtime = "@com_google_protobuf//:protobuf",
++        **kwargs):
++    """Bazel rule to create a C++ protobuf library from proto source files
++
++    NOTE: the rule is only an internal workaround to generate protos. The
++    interface may change and the rule may be removed when bazel has introduced
++    the native rule.
++
++    Args:
++      name: the name of the cc_proto_library.
++      srcs: the .proto files of the cc_proto_library.
++      deps: a list of dependency labels; must be cc_proto_library.
++      cc_libs: a list of other cc_library targets depended on by the generated
++          cc_library.
++      include: a string indicating the include path of the .proto files.
++      protoc: the label of the protocol compiler to generate the sources.
++      internal_bootstrap_hack: a flag indicating if the cc_proto_library is used only
++          for bootstrapping. When it is set to True, no files will be generated.
++          The rule will simply be a provider for .proto files, so that other
++          cc_proto_library can depend on it.
++      use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
++          when processing the proto files.
++      default_runtime: the implicitly default runtime which will be depended on by
++          the generated cc_library target.
++      **kwargs: other keyword arguments that are passed to cc_library.
++
++    """
++
++    includes = []
++    if include != None:
++        includes = [include]
++
++    if internal_bootstrap_hack:
++        # For pre-checked-in generated files, we add the internal_bootstrap_hack
++        # which will skip the codegen action.
++        proto_gen(
++            name = name + "_genproto",
++            srcs = srcs,
++            deps = [s + "_genproto" for s in deps],
++            includes = includes,
++            protoc = protoc,
++            visibility = ["//visibility:public"],
++        )
++
++        # An empty cc_library to make rule dependency consistent.
++        native.cc_library(
++            name = name,
++            **kwargs
++        )
++        return
++
++    grpc_cpp_plugin = None
++    if use_grpc_plugin:
++        grpc_cpp_plugin = "//external:grpc_cpp_plugin"
++
++    gen_srcs = _CcSrcs(srcs, use_grpc_plugin)
++    gen_hdrs = _CcHdrs(srcs, use_grpc_plugin)
++    outs = gen_srcs + gen_hdrs
++
++    proto_gen(
++        name = name + "_genproto",
++        srcs = srcs,
++        deps = [s + "_genproto" for s in deps],
++        includes = includes,
++        protoc = protoc,
++        plugin = grpc_cpp_plugin,
++        plugin_language = "grpc",
++        gen_cc = 1,
++        outs = outs,
++        visibility = ["//visibility:public"],
++    )
++
++    if default_runtime and not default_runtime in cc_libs:
++        cc_libs = cc_libs + [default_runtime]
++    if use_grpc_plugin:
++        cc_libs = cc_libs + ["//external:grpc_lib"]
++
++    native.cc_library(
++        name = name,
++        srcs = gen_srcs,
++        hdrs = gen_hdrs,
++        deps = cc_libs + deps,
++        includes = includes,
++        alwayslink = 1,
++        **kwargs
++    )
++
++"""Documents `proto_gen`: generates code from Protocol Buffers definitions.
++
++This rule helps you to implement Starlark macros specific to the target
++language. You should prefer more specific `cc_proto_library`,
++`py_proto_library` and others unless you are adding such wrapper macros.
++
++Args:
++  srcs: Protocol Buffers definition files (.proto) to run the protocol compiler
++    against.
++  deps: a list of dependency labels; must be other proto libraries.
++  includes: a list of include paths to .proto files.
++  protoc: the label of the protocol compiler to generate the sources.
++  plugin: the label of the protocol compiler plugin to be passed to the protocol
++    compiler.
++  plugin_language: the language of the generated sources
++  plugin_options: a list of options to be passed to the plugin
++  gen_cc: generates C++ sources in addition to the ones from the plugin.
++  gen_py: generates Python sources in addition to the ones from the plugin.
++  outs: a list of labels of the expected outputs from the protocol compiler.
++"""
++
++def _adapt_proto_library_impl(ctx):
++    deps = [dep[ProtoInfo] for dep in ctx.attr.deps]
++
++    srcs = [src for dep in deps for src in dep.direct_sources]
++    return struct(
++        proto = struct(
++            srcs = srcs,
++            import_flags = ["-I{}".format(path) for dep in deps for path in dep.transitive_proto_path.to_list()],
++            deps = srcs,
++        ),
++    )
++
++adapt_proto_library = rule(
++    implementation = _adapt_proto_library_impl,
++    attrs = {
++        "deps": attr.label_list(
++            mandatory = True,
++            providers = [ProtoInfo],
++        ),
++    },
++    doc = "Adapts `proto_library` from `@rules_proto` to be used with `{cc,py}_proto_library` from this file.",
++)
diff --git a/bazel/patches/pytorch.patch b/bazel/patches/pytorch.patch
new file mode 100644
index 000000000..df63024e8
--- /dev/null
+++ b/bazel/patches/pytorch.patch
@@ -0,0 +1,44 @@
+--- a/torch/__init__.py
++++ b/torch/__init__.py
+@@ -172,6 +172,41 @@
+     here = os.path.abspath(__file__)
+     lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name)
+ 
++    import pathlib
++    torch_root = pathlib.Path(here).parent.parent
++    packages = [
++        'cublas',
++        'cudnn',
++        'cuda_nvrtc',
++        'cuda_runtime',
++        'cuda_cupti',
++        'cufft',
++        'curand',
++        'cusolver',
++        'cusparse',
++        'nccl',
++        'nvjitlink',
++        'nvtx',
++    ]
++    rules_python_prefix = 'rules_python~~pip~spu_pip_dev_311_nvidia'
++    cuda_version = 'cu12'
++    nvidia_symlink_dir = torch_root / 'nvidia'
++    nvidia_symlink_dir.mkdir(exist_ok=True)
++    for pkg in packages:
++        pkg_dirname = f'../../../{rules_python_prefix}_{pkg}_{cuda_version}'
++        dest_dir = pathlib.Path(pkg_dirname) / f"site-packages/nvidia/{pkg}"
++        symlink_loc = nvidia_symlink_dir / pkg
++        if symlink_loc.exists():
++            assert symlink_loc.is_symlink()
++            if symlink_loc.readlink() != dest_dir:
++                symlink_loc.unlink()
++        if not symlink_loc.exists():
++            symlink_loc.symlink_to(dest_dir)
++
++    # Preload the correct libnvJitLink library. The other libraries don't need
++    # this because they're loaded via relative paths.
++    ctypes.CDLL(torch_root / 'nvidia/nvjitlink/lib/libnvJitLink.so.12', mode=ctypes.RTLD_GLOBAL)
++
+     try:
+         ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
+     except OSError as err:
diff --git a/bazel/patches/pytorch_record.patch b/bazel/patches/pytorch_record.patch
new file mode 100644
index 000000000..9a0f41efa
--- /dev/null
+++ b/bazel/patches/pytorch_record.patch
@@ -0,0 +1,34 @@
+--- a/torch-2.3.0.dist-info/RECORD
++++ b/torch-2.3.0.dist-info/RECORD
+@@ -10386,19 +10386,12 @@
+ torch/_compile.py,sha256=a2g6zXCXwnirEFu-VSiSzRfk23_-E0MgVbodLOHfjr0,1001
+ torch/_classes.py,sha256=zez2IGbpzN3f1P7Tg8s-fg3pz_ATN6hAVTxKsSqtV9o,1686
+ torch/_appdirs.py,sha256=GjuBh72l3BhGE4vJSdqGj-8QHjGbkhuMYaOLchLcqOQ,26167
+-torch/__init__.py,sha256=kCWXnQYOawq9ORqPg-5cLOnMwoAboxlI8PQcgqwHJzA,79997
++torch/__init__.py,sha256=2KMPP6IBfFqD8r02xTXzVtFPnQQz9Yn-fBfO9QBs7Qk,81211
+ torch/__future__.py,sha256=yk9l_KWsfVIzUBx9cGr-OdtWmb-pI8ZhcROAm3a_FQw,3185
+ torch/__config__.py,sha256=kv8yDflHiu3B2rxjOe48upLeB2VXnTxdXKeAYcxnJ5c,553
+ torch/_VF.pyi,sha256=kmuEzpodTqrBJhZhx054yhGBqizo80StmroZOGZz-dI,1137705
+ torch/_VF.py,sha256=6gWebiEvyG5GFiNTcMuigU7UAPEesYJmWKQTL_1GTrM,643
+ torch/_C.cpython-311-x86_64-linux-gnu.so,sha256=DgQrQZ1UWG8CqyFAOIrkFeOBbdRsL8uRH7R6LYN1GR4,37857
+-torch-2.3.0.dist-info/LICENSE,sha256=nCkW1Dsriv9qPGZi7VhLay19bc-E3XJxgd7q3FRyIU4,351851
+-torch-2.3.0.dist-info/METADATA,sha256=h6XMrDq7R7q01HNfqNNgyDKXDn2p4vfbUB2IxCL7gdY,26120
+-torch-2.3.0.dist-info/NOTICE,sha256=wsx78MrsdlLCtGCopHC-oWd_JB5KuOQx3zTPF_Wp_sA,23632
+-torch-2.3.0.dist-info/WHEEL,sha256=heXqORHgAzVHsSEcHHvvTohM_YB7MmN0HUQQME76CM0,105
+-torch-2.3.0.dist-info/entry_points.txt,sha256=SRhyGhohzXtwg-GPZHrgubQLqk1v5i-kOQiV-mj5fms,296
+-torch-2.3.0.dist-info/top_level.txt,sha256=MsBcfJyMU15lW1efu5w7Tzd4MenrYHiuaixbHMfAoco,25
+-torch-2.3.0.dist-info/RECORD,,
+ torchgen/static_runtime/generator.py,sha256=Sfe8TR6inZv-FHhJAG6gXBFXutPVMO8I4Uzw3xnGdd4,26374
+ torchgen/static_runtime/gen_static_runtime_ops.py,sha256=Esr32XW78YkLpGAGhIw6ZsxmkwgI_Pd3j0rh585ijow,7347
+ torchgen/static_runtime/config.py,sha256=fEMB4EdO8aX47aW13s-nVSy-yM5qIzfXrOYaQkmv3-A,14493
+@@ -10538,3 +10531,10 @@
+ torchgen/context.py,sha256=798e45g0zoR69Xn_4HjTuBRXbuNnyyK-j5_vWOnrs_s,3974
+ torchgen/code_template.py,sha256=z3N3FvXHfvO2aLIu2LoFqe7XGpzbFfzXEyzhGez2KME,2903
+ torchgen/__init__.py,sha256=iirTpG38WcCsNMhEbi1dg7_jad6ptk_uzZ-BzaGBFyU,348
++torch-2.3.0.dist-info/LICENSE,sha256=nCkW1Dsriv9qPGZi7VhLay19bc-E3XJxgd7q3FRyIU4,351851
++torch-2.3.0.dist-info/METADATA,sha256=h6XMrDq7R7q01HNfqNNgyDKXDn2p4vfbUB2IxCL7gdY,26120
++torch-2.3.0.dist-info/NOTICE,sha256=wsx78MrsdlLCtGCopHC-oWd_JB5KuOQx3zTPF_Wp_sA,23632
++torch-2.3.0.dist-info/WHEEL,sha256=heXqORHgAzVHsSEcHHvvTohM_YB7MmN0HUQQME76CM0,105
++torch-2.3.0.dist-info/entry_points.txt,sha256=SRhyGhohzXtwg-GPZHrgubQLqk1v5i-kOQiV-mj5fms,296
++torch-2.3.0.dist-info/top_level.txt,sha256=MsBcfJyMU15lW1efu5w7Tzd4MenrYHiuaixbHMfAoco,25
++torch-2.3.0.dist-info/RECORD,,
diff --git a/bazel/patches/xla-non-hermetic-python.patch b/bazel/patches/xla-non-hermetic-python.patch
deleted file mode 100644
index ac1b0cc00..000000000
--- a/bazel/patches/xla-non-hermetic-python.patch
+++ /dev/null
@@ -1,786 +0,0 @@
-diff --git a/third_party/py/BUILD.tpl b/third_party/py/BUILD.tpl
-index 7cc1e08568..45480bd4a3 100644
---- a/third_party/py/BUILD.tpl
-+++ b/third_party/py/BUILD.tpl
-@@ -5,17 +5,16 @@ package(default_visibility = ["//visibility:public"])
- # Point both runtimes to the same python binary to ensure we always
- # use the python binary specified by ./configure.py script.
- load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair")
--load("@python//:defs.bzl", "interpreter")
-
- py_runtime(
-     name = "py2_runtime",
--    interpreter_path = interpreter,
-+    interpreter_path = "%{PYTHON_BIN_PATH}",
-     python_version = "PY2",
- )
-
- py_runtime(
-     name = "py3_runtime",
--    interpreter_path = interpreter,
-+    interpreter_path = "%{PYTHON_BIN_PATH}",
-     python_version = "PY3",
- )
-
-@@ -33,8 +32,27 @@ toolchain(
-     exec_compatible_with = [%{PLATFORM_CONSTRAINT}],
- )
-
--alias(name = "python_headers",
--      actual = "@python//:python_headers")
-+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-+# See https://docs.python.org/3/extending/windows.html
-+cc_import(
-+    name = "python_lib",
-+    interface_library = select({
-+        ":windows": ":python_import_lib",
-+        # A placeholder for Unix platforms which makes --no_build happy.
-+        "//conditions:default": "not-existing.lib",
-+    }),
-+    system_provided = 1,
-+)
-+
-+cc_library(
-+    name = "python_headers",
-+    hdrs = [":python_include"],
-+    deps = select({
-+        ":windows": [":python_lib"],
-+        "//conditions:default": [],
-+    }),
-+    includes = ["python_include"],
-+)
-
- # This alias is exists for the use of targets in the @llvm-project dependency,
- # which expect a python_headers target called @python_runtime//:headers. We use
-@@ -45,9 +63,18 @@ alias(
-     actual = ":python_headers",
- )
-
-+cc_library(
-+    name = "numpy_headers",
-+    hdrs = [":numpy_include"],
-+    includes = ["numpy_include"],
-+)
-
- config_setting(
-     name = "windows",
-     values = {"cpu": "x64_windows"},
-     visibility = ["//visibility:public"],
--)
-\ No newline at end of file
-+)
-+
-+%{PYTHON_INCLUDE_GENRULE}
-+%{NUMPY_INCLUDE_GENRULE}
-+%{PYTHON_IMPORT_LIB_GENRULE}
-\ No newline at end of file
-diff --git a/third_party/py/numpy/BUILD b/third_party/py/numpy/BUILD
-index 97c7907fc3..c80cc5287b 100644
---- a/third_party/py/numpy/BUILD
-+++ b/third_party/py/numpy/BUILD
-@@ -2,14 +2,15 @@ licenses(["restricted"])
-
- package(default_visibility = ["//visibility:public"])
-
--alias(
-+py_library(
-     name = "numpy",
--    actual = "@pypi_numpy//:pkg",
-+    srcs = ["tf_numpy_dummy.py"],
-+    srcs_version = "PY3",
- )
-
- alias(
-     name = "headers",
--    actual = "@pypi_numpy//:numpy_headers",
-+    actual = "@local_config_python//:numpy_headers",
- )
-
- genrule(
-diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
-index 3728a91b93..89732c3e33 100644
---- a/third_party/py/python_configure.bzl
-+++ b/third_party/py/python_configure.bzl
-@@ -1,4 +1,9 @@
- """Repository rule for Python autoconfiguration.
-+
-+`python_configure` depends on the following environment variables:
-+
-+  * `PYTHON_BIN_PATH`: location of python binary.
-+  * `PYTHON_LIB_PATH`: Location of python libraries.
- """
-
- load(
-@@ -6,8 +11,192 @@ load(
-     "BAZEL_SH",
-     "PYTHON_BIN_PATH",
-     "PYTHON_LIB_PATH",
-+    "TF_PYTHON_CONFIG_REPO",
-+    "auto_config_fail",
-+    "config_repo_label",
-+    "execute",
-+    "get_bash_bin",
-+    "get_host_environ",
-+    "get_python_bin",
-+    "is_windows",
-+    "raw_exec",
-+    "read_dir",
- )
-
-+def _genrule(src_dir, genrule_name, command, outs):
-+    """Returns a string with a genrule.
-+
-+    Genrule executes the given command and produces the given outputs.
-+    """
-+    return (
-+        "genrule(\n" +
-+        '    name = "' +
-+        genrule_name + '",\n' +
-+        "    outs = [\n" +
-+        outs +
-+        "\n    ],\n" +
-+        '    cmd = """\n' +
-+        command +
-+        '\n   """,\n' +
-+        ")\n"
-+    )
-+
-+def _norm_path(path):
-+    """Returns a path with '/' and remove the trailing slash."""
-+    path = path.replace("\\", "/")
-+    if path[-1] == "/":
-+        path = path[:-1]
-+    return path
-+
-+def _symlink_genrule_for_dir(
-+        repository_ctx,
-+        src_dir,
-+        dest_dir,
-+        genrule_name,
-+        src_files = [],
-+        dest_files = []):
-+    """Returns a genrule to symlink(or copy if on Windows) a set of files.
-+
-+    If src_dir is passed, files will be read from the given directory; otherwise
-+    we assume files are in src_files and dest_files
-+    """
-+    if src_dir != None:
-+        src_dir = _norm_path(src_dir)
-+        dest_dir = _norm_path(dest_dir)
-+        files = "\n".join(read_dir(repository_ctx, src_dir))
-+
-+        # Create a list with the src_dir stripped to use for outputs.
-+        dest_files = files.replace(src_dir, "").splitlines()
-+        src_files = files.splitlines()
-+    command = []
-+    outs = []
-+    for i in range(len(dest_files)):
-+        if dest_files[i] != "":
-+            # If we have only one file to link we do not want to use the dest_dir, as
-+            # $(@D) will include the full path to the file.
-+            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-+
-+            # Copy the headers to create a sandboxable setup.
-+            cmd = "cp -f"
-+            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-+            outs.append('        "' + dest_dir + dest_files[i] + '",')
-+    genrule = _genrule(
-+        src_dir,
-+        genrule_name,
-+        " && ".join(command),
-+        "\n".join(outs),
-+    )
-+    return genrule
-+
-+def _get_python_lib(repository_ctx, python_bin):
-+    """Gets the python lib path."""
-+    python_lib = get_host_environ(repository_ctx, PYTHON_LIB_PATH)
-+    if python_lib != None:
-+        return python_lib
-+
-+    # The interesting program to execute.
-+    print_lib = [
-+        "from __future__ import print_function",
-+        "import site",
-+        "import os",
-+        "python_paths = []",
-+        "if os.getenv('PYTHONPATH') is not None:",
-+        "  python_paths = os.getenv('PYTHONPATH').split(':')",
-+        "try:",
-+        "  library_paths = site.getsitepackages()",
-+        "except AttributeError:",
-+        "  from distutils.sysconfig import get_python_lib",
-+        "  library_paths = [get_python_lib()]",
-+        "all_paths = set(python_paths + library_paths)",
-+        "paths = []",
-+        "for path in all_paths:",
-+        "  if os.path.isdir(path):",
-+        "    paths.append(path)",
-+        "if len(paths) >=1:",
-+        "  print(paths[0])",
-+    ]
-+
-+    # The below script writes the above program to a file
-+    # and executes it. This is to work around the limitation
-+    # of not being able to upload files as part of execute.
-+    cmd = "from os import linesep;"
-+    cmd += "f = open('script.py', 'w');"
-+    for line in print_lib:
-+        cmd += "f.write(\"%s\" + linesep);" % line
-+    cmd += "f.close();"
-+    cmd += "from subprocess import call;"
-+    cmd += "call([\"%s\", \"script.py\"]);" % python_bin
-+
-+    result = execute(repository_ctx, [python_bin, "-c", cmd])
-+    return result.stdout.strip()
-+
-+def _check_python_lib(repository_ctx, python_lib):
-+    """Checks the python lib path."""
-+    cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
-+    result = raw_exec(repository_ctx, [get_bash_bin(repository_ctx), "-c", cmd])
-+    if result.return_code == 1:
-+        auto_config_fail("Invalid python library path: %s" % python_lib)
-+
-+def _check_python_bin(repository_ctx, python_bin):
-+    """Checks the python bin path."""
-+    cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
-+    result = raw_exec(repository_ctx, [get_bash_bin(repository_ctx), "-c", cmd])
-+    if result.return_code == 1:
-+        auto_config_fail("--define %s='%s' is not executable. Is it the python binary?" % (
-+            PYTHON_BIN_PATH,
-+            python_bin,
-+        ))
-+
-+def _get_python_include(repository_ctx, python_bin):
-+    """Gets the python include path."""
-+    result = execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-Wignore",
-+            "-c",
-+            "import sysconfig; " +
-+            "print(sysconfig.get_path('include'))",
-+        ],
-+        error_msg = "Problem getting python include path.",
-+        error_details = ("Is the Python binary path set up right? " +
-+                         "(See ./configure or " + PYTHON_BIN_PATH + ".) " +
-+                         "Is distutils installed?"),
-+    )
-+    return result.stdout.splitlines()[0]
-+
-+def _get_python_import_lib_name(repository_ctx, python_bin):
-+    """Get Python import library name (pythonXY.lib) on Windows."""
-+    result = execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-c",
-+            "import sys;" +
-+            'print("python" + str(sys.version_info[0]) + ' +
-+            '      str(sys.version_info[1]) + ".lib")',
-+        ],
-+        error_msg = "Problem getting python import library.",
-+        error_details = ("Is the Python binary path set up right? " +
-+                         "(See ./configure or " + PYTHON_BIN_PATH + ".) "),
-+    )
-+    return result.stdout.splitlines()[0]
-+
-+def _get_numpy_include(repository_ctx, python_bin):
-+    """Gets the numpy include path."""
-+    return execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-c",
-+            "from __future__ import print_function;" +
-+            "import numpy;" +
-+            " print(numpy.get_include());",
-+        ],
-+        error_msg = "Problem getting numpy include path.",
-+        error_details = "Is numpy installed?",
-+    ).stdout.splitlines()[0]
-+
- def _create_local_python_repository(repository_ctx):
-     """Creates the repository containing files set up to build with Python."""
-
-@@ -15,14 +204,68 @@ def _create_local_python_repository(repository_ctx):
-     # function to be restarted with all previous state being lost. This
-     # can easily lead to a O(n^2) runtime in the number of labels.
-     build_tpl = repository_ctx.path(Label("//third_party/py:BUILD.tpl"))
-+
-+    python_bin = get_python_bin(repository_ctx)
-+    _check_python_bin(repository_ctx, python_bin)
-+    python_lib = _get_python_lib(repository_ctx, python_bin)
-+    _check_python_lib(repository_ctx, python_lib)
-+    python_include = _get_python_include(repository_ctx, python_bin)
-+    numpy_include = _get_numpy_include(repository_ctx, python_bin) + "/numpy"
-+    python_include_rule = _symlink_genrule_for_dir(
-+        repository_ctx,
-+        python_include,
-+        "python_include",
-+        "python_include",
-+    )
-+    python_import_lib_genrule = ""
-+
-+    # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-+    # See https://docs.python.org/3/extending/windows.html
-+    if is_windows(repository_ctx):
-+        python_bin = python_bin.replace("\\", "/")
-+        python_include = _norm_path(python_include)
-+        python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
-+        python_import_lib_src = python_include.rsplit("/", 1)[0] + "/libs/" + python_import_lib_name
-+        python_import_lib_genrule = _symlink_genrule_for_dir(
-+            repository_ctx,
-+            None,
-+            "",
-+            "python_import_lib",
-+            [python_import_lib_src],
-+            [python_import_lib_name],
-+        )
-+    numpy_include_rule = _symlink_genrule_for_dir(
-+        repository_ctx,
-+        numpy_include,
-+        "numpy_include/numpy",
-+        "numpy_include",
-+    )
-+
-     platform_constraint = ""
-     if repository_ctx.attr.platform_constraint:
-         platform_constraint = "\"%s\"" % repository_ctx.attr.platform_constraint
--    repository_ctx.template("BUILD", build_tpl, {"%{PLATFORM_CONSTRAINT}": platform_constraint})
-+    repository_ctx.template("BUILD", build_tpl, {
-+        "%{PYTHON_BIN_PATH}": python_bin,
-+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-+        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
-+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-+        "%{PLATFORM_CONSTRAINT}": platform_constraint,
-+    })
-+
-+def _create_remote_python_repository(repository_ctx, remote_config_repo):
-+    """Creates pointers to a remotely configured repo set up to build with Python.
-+    """
-+    repository_ctx.template("BUILD", config_repo_label(remote_config_repo, ":BUILD"), {})
-
- def _python_autoconf_impl(repository_ctx):
-     """Implementation of the python_autoconf repository rule."""
--    _create_local_python_repository(repository_ctx)
-+    if get_host_environ(repository_ctx, TF_PYTHON_CONFIG_REPO) != None:
-+        _create_remote_python_repository(
-+            repository_ctx,
-+            get_host_environ(repository_ctx, TF_PYTHON_CONFIG_REPO),
-+        )
-+    else:
-+        _create_local_python_repository(repository_ctx)
-
- _ENVIRONS = [
-     BAZEL_SH,
-@@ -32,6 +275,7 @@ _ENVIRONS = [
-
- local_python_configure = repository_rule(
-     implementation = _create_local_python_repository,
-+    environ = _ENVIRONS,
-     attrs = {
-         "environ": attr.string_dict(),
-         "platform_constraint": attr.string(),
-@@ -50,6 +294,7 @@ remote_python_configure = repository_rule(
-
- python_configure = repository_rule(
-     implementation = _python_autoconf_impl,
-+    environ = _ENVIRONS + [TF_PYTHON_CONFIG_REPO],
-     attrs = {
-         "platform_constraint": attr.string(),
-     },
-diff --git a/third_party/tsl/third_party/py/BUILD.tpl b/third_party/tsl/third_party/py/BUILD.tpl
-index 7cc1e08568..45480bd4a3 100644
---- a/third_party/tsl/third_party/py/BUILD.tpl
-+++ b/third_party/tsl/third_party/py/BUILD.tpl
-@@ -5,17 +5,16 @@ package(default_visibility = ["//visibility:public"])
- # Point both runtimes to the same python binary to ensure we always
- # use the python binary specified by ./configure.py script.
- load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair")
--load("@python//:defs.bzl", "interpreter")
-
- py_runtime(
-     name = "py2_runtime",
--    interpreter_path = interpreter,
-+    interpreter_path = "%{PYTHON_BIN_PATH}",
-     python_version = "PY2",
- )
-
- py_runtime(
-     name = "py3_runtime",
--    interpreter_path = interpreter,
-+    interpreter_path = "%{PYTHON_BIN_PATH}",
-     python_version = "PY3",
- )
-
-@@ -33,8 +32,27 @@ toolchain(
-     exec_compatible_with = [%{PLATFORM_CONSTRAINT}],
- )
-
--alias(name = "python_headers",
--      actual = "@python//:python_headers")
-+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-+# See https://docs.python.org/3/extending/windows.html
-+cc_import(
-+    name = "python_lib",
-+    interface_library = select({
-+        ":windows": ":python_import_lib",
-+        # A placeholder for Unix platforms which makes --no_build happy.
-+        "//conditions:default": "not-existing.lib",
-+    }),
-+    system_provided = 1,
-+)
-+
-+cc_library(
-+    name = "python_headers",
-+    hdrs = [":python_include"],
-+    deps = select({
-+        ":windows": [":python_lib"],
-+        "//conditions:default": [],
-+    }),
-+    includes = ["python_include"],
-+)
-
- # This alias is exists for the use of targets in the @llvm-project dependency,
- # which expect a python_headers target called @python_runtime//:headers. We use
-@@ -45,9 +63,18 @@ alias(
-     actual = ":python_headers",
- )
-
-+cc_library(
-+    name = "numpy_headers",
-+    hdrs = [":numpy_include"],
-+    includes = ["numpy_include"],
-+)
-
- config_setting(
-     name = "windows",
-     values = {"cpu": "x64_windows"},
-     visibility = ["//visibility:public"],
--)
-\ No newline at end of file
-+)
-+
-+%{PYTHON_INCLUDE_GENRULE}
-+%{NUMPY_INCLUDE_GENRULE}
-+%{PYTHON_IMPORT_LIB_GENRULE}
-\ No newline at end of file
-diff --git a/third_party/tsl/third_party/py/numpy/BUILD b/third_party/tsl/third_party/py/numpy/BUILD
-index 97c7907fc3..c80cc5287b 100644
---- a/third_party/tsl/third_party/py/numpy/BUILD
-+++ b/third_party/tsl/third_party/py/numpy/BUILD
-@@ -2,14 +2,15 @@ licenses(["restricted"])
-
- package(default_visibility = ["//visibility:public"])
-
--alias(
-+py_library(
-     name = "numpy",
--    actual = "@pypi_numpy//:pkg",
-+    srcs = ["tf_numpy_dummy.py"],
-+    srcs_version = "PY3",
- )
-
- alias(
-     name = "headers",
--    actual = "@pypi_numpy//:numpy_headers",
-+    actual = "@local_config_python//:numpy_headers",
- )
-
- genrule(
-diff --git a/third_party/tsl/third_party/py/python_configure.bzl b/third_party/tsl/third_party/py/python_configure.bzl
-index 3728a91b93..89732c3e33 100644
---- a/third_party/tsl/third_party/py/python_configure.bzl
-+++ b/third_party/tsl/third_party/py/python_configure.bzl
-@@ -1,4 +1,9 @@
- """Repository rule for Python autoconfiguration.
-+
-+`python_configure` depends on the following environment variables:
-+
-+  * `PYTHON_BIN_PATH`: location of python binary.
-+  * `PYTHON_LIB_PATH`: Location of python libraries.
- """
-
- load(
-@@ -6,8 +11,192 @@ load(
-     "BAZEL_SH",
-     "PYTHON_BIN_PATH",
-     "PYTHON_LIB_PATH",
-+    "TF_PYTHON_CONFIG_REPO",
-+    "auto_config_fail",
-+    "config_repo_label",
-+    "execute",
-+    "get_bash_bin",
-+    "get_host_environ",
-+    "get_python_bin",
-+    "is_windows",
-+    "raw_exec",
-+    "read_dir",
- )
-
-+def _genrule(src_dir, genrule_name, command, outs):
-+    """Returns a string with a genrule.
-+
-+    Genrule executes the given command and produces the given outputs.
-+    """
-+    return (
-+        "genrule(\n" +
-+        '    name = "' +
-+        genrule_name + '",\n' +
-+        "    outs = [\n" +
-+        outs +
-+        "\n    ],\n" +
-+        '    cmd = """\n' +
-+        command +
-+        '\n   """,\n' +
-+        ")\n"
-+    )
-+
-+def _norm_path(path):
-+    """Returns a path with '/' and remove the trailing slash."""
-+    path = path.replace("\\", "/")
-+    if path[-1] == "/":
-+        path = path[:-1]
-+    return path
-+
-+def _symlink_genrule_for_dir(
-+        repository_ctx,
-+        src_dir,
-+        dest_dir,
-+        genrule_name,
-+        src_files = [],
-+        dest_files = []):
-+    """Returns a genrule to symlink(or copy if on Windows) a set of files.
-+
-+    If src_dir is passed, files will be read from the given directory; otherwise
-+    we assume files are in src_files and dest_files
-+    """
-+    if src_dir != None:
-+        src_dir = _norm_path(src_dir)
-+        dest_dir = _norm_path(dest_dir)
-+        files = "\n".join(read_dir(repository_ctx, src_dir))
-+
-+        # Create a list with the src_dir stripped to use for outputs.
-+        dest_files = files.replace(src_dir, "").splitlines()
-+        src_files = files.splitlines()
-+    command = []
-+    outs = []
-+    for i in range(len(dest_files)):
-+        if dest_files[i] != "":
-+            # If we have only one file to link we do not want to use the dest_dir, as
-+            # $(@D) will include the full path to the file.
-+            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-+
-+            # Copy the headers to create a sandboxable setup.
-+            cmd = "cp -f"
-+            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-+            outs.append('        "' + dest_dir + dest_files[i] + '",')
-+    genrule = _genrule(
-+        src_dir,
-+        genrule_name,
-+        " && ".join(command),
-+        "\n".join(outs),
-+    )
-+    return genrule
-+
-+def _get_python_lib(repository_ctx, python_bin):
-+    """Gets the python lib path."""
-+    python_lib = get_host_environ(repository_ctx, PYTHON_LIB_PATH)
-+    if python_lib != None:
-+        return python_lib
-+
-+    # The interesting program to execute.
-+    print_lib = [
-+        "from __future__ import print_function",
-+        "import site",
-+        "import os",
-+        "python_paths = []",
-+        "if os.getenv('PYTHONPATH') is not None:",
-+        "  python_paths = os.getenv('PYTHONPATH').split(':')",
-+        "try:",
-+        "  library_paths = site.getsitepackages()",
-+        "except AttributeError:",
-+        "  from distutils.sysconfig import get_python_lib",
-+        "  library_paths = [get_python_lib()]",
-+        "all_paths = set(python_paths + library_paths)",
-+        "paths = []",
-+        "for path in all_paths:",
-+        "  if os.path.isdir(path):",
-+        "    paths.append(path)",
-+        "if len(paths) >=1:",
-+        "  print(paths[0])",
-+    ]
-+
-+    # The below script writes the above program to a file
-+    # and executes it. This is to work around the limitation
-+    # of not being able to upload files as part of execute.
-+    cmd = "from os import linesep;"
-+    cmd += "f = open('script.py', 'w');"
-+    for line in print_lib:
-+        cmd += "f.write(\"%s\" + linesep);" % line
-+    cmd += "f.close();"
-+    cmd += "from subprocess import call;"
-+    cmd += "call([\"%s\", \"script.py\"]);" % python_bin
-+
-+    result = execute(repository_ctx, [python_bin, "-c", cmd])
-+    return result.stdout.strip()
-+
-+def _check_python_lib(repository_ctx, python_lib):
-+    """Checks the python lib path."""
-+    cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
-+    result = raw_exec(repository_ctx, [get_bash_bin(repository_ctx), "-c", cmd])
-+    if result.return_code == 1:
-+        auto_config_fail("Invalid python library path: %s" % python_lib)
-+
-+def _check_python_bin(repository_ctx, python_bin):
-+    """Checks the python bin path."""
-+    cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
-+    result = raw_exec(repository_ctx, [get_bash_bin(repository_ctx), "-c", cmd])
-+    if result.return_code == 1:
-+        auto_config_fail("--define %s='%s' is not executable. Is it the python binary?" % (
-+            PYTHON_BIN_PATH,
-+            python_bin,
-+        ))
-+
-+def _get_python_include(repository_ctx, python_bin):
-+    """Gets the python include path."""
-+    result = execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-Wignore",
-+            "-c",
-+            "import sysconfig; " +
-+            "print(sysconfig.get_path('include'))",
-+        ],
-+        error_msg = "Problem getting python include path.",
-+        error_details = ("Is the Python binary path set up right? " +
-+                         "(See ./configure or " + PYTHON_BIN_PATH + ".) " +
-+                         "Is distutils installed?"),
-+    )
-+    return result.stdout.splitlines()[0]
-+
-+def _get_python_import_lib_name(repository_ctx, python_bin):
-+    """Get Python import library name (pythonXY.lib) on Windows."""
-+    result = execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-c",
-+            "import sys;" +
-+            'print("python" + str(sys.version_info[0]) + ' +
-+            '      str(sys.version_info[1]) + ".lib")',
-+        ],
-+        error_msg = "Problem getting python import library.",
-+        error_details = ("Is the Python binary path set up right? " +
-+                         "(See ./configure or " + PYTHON_BIN_PATH + ".) "),
-+    )
-+    return result.stdout.splitlines()[0]
-+
-+def _get_numpy_include(repository_ctx, python_bin):
-+    """Gets the numpy include path."""
-+    return execute(
-+        repository_ctx,
-+        [
-+            python_bin,
-+            "-c",
-+            "from __future__ import print_function;" +
-+            "import numpy;" +
-+            " print(numpy.get_include());",
-+        ],
-+        error_msg = "Problem getting numpy include path.",
-+        error_details = "Is numpy installed?",
-+    ).stdout.splitlines()[0]
-+
- def _create_local_python_repository(repository_ctx):
-     """Creates the repository containing files set up to build with Python."""
-
-@@ -15,14 +204,68 @@ def _create_local_python_repository(repository_ctx):
-     # function to be restarted with all previous state being lost. This
-     # can easily lead to a O(n^2) runtime in the number of labels.
-     build_tpl = repository_ctx.path(Label("//third_party/py:BUILD.tpl"))
-+
-+    python_bin = get_python_bin(repository_ctx)
-+    _check_python_bin(repository_ctx, python_bin)
-+    python_lib = _get_python_lib(repository_ctx, python_bin)
-+    _check_python_lib(repository_ctx, python_lib)
-+    python_include = _get_python_include(repository_ctx, python_bin)
-+    numpy_include = _get_numpy_include(repository_ctx, python_bin) + "/numpy"
-+    python_include_rule = _symlink_genrule_for_dir(
-+        repository_ctx,
-+        python_include,
-+        "python_include",
-+        "python_include",
-+    )
-+    python_import_lib_genrule = ""
-+
-+    # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-+    # See https://docs.python.org/3/extending/windows.html
-+    if is_windows(repository_ctx):
-+        python_bin = python_bin.replace("\\", "/")
-+        python_include = _norm_path(python_include)
-+        python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
-+        python_import_lib_src = python_include.rsplit("/", 1)[0] + "/libs/" + python_import_lib_name
-+        python_import_lib_genrule = _symlink_genrule_for_dir(
-+            repository_ctx,
-+            None,
-+            "",
-+            "python_import_lib",
-+            [python_import_lib_src],
-+            [python_import_lib_name],
-+        )
-+    numpy_include_rule = _symlink_genrule_for_dir(
-+        repository_ctx,
-+        numpy_include,
-+        "numpy_include/numpy",
-+        "numpy_include",
-+    )
-+
-     platform_constraint = ""
-     if repository_ctx.attr.platform_constraint:
-         platform_constraint = "\"%s\"" % repository_ctx.attr.platform_constraint
--    repository_ctx.template("BUILD", build_tpl, {"%{PLATFORM_CONSTRAINT}": platform_constraint})
-+    repository_ctx.template("BUILD", build_tpl, {
-+        "%{PYTHON_BIN_PATH}": python_bin,
-+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-+        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
-+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-+        "%{PLATFORM_CONSTRAINT}": platform_constraint,
-+    })
-+
-+def _create_remote_python_repository(repository_ctx, remote_config_repo):
-+    """Creates pointers to a remotely configured repo set up to build with Python.
-+    """
-+    repository_ctx.template("BUILD", config_repo_label(remote_config_repo, ":BUILD"), {})
-
- def _python_autoconf_impl(repository_ctx):
-     """Implementation of the python_autoconf repository rule."""
--    _create_local_python_repository(repository_ctx)
-+    if get_host_environ(repository_ctx, TF_PYTHON_CONFIG_REPO) != None:
-+        _create_remote_python_repository(
-+            repository_ctx,
-+            get_host_environ(repository_ctx, TF_PYTHON_CONFIG_REPO),
-+        )
-+    else:
-+        _create_local_python_repository(repository_ctx)
-
- _ENVIRONS = [
-     BAZEL_SH,
-@@ -32,6 +275,7 @@ _ENVIRONS = [
-
- local_python_configure = repository_rule(
-     implementation = _create_local_python_repository,
-+    environ = _ENVIRONS,
-     attrs = {
-         "environ": attr.string_dict(),
-         "platform_constraint": attr.string(),
-@@ -50,6 +294,7 @@ remote_python_configure = repository_rule(
-
- python_configure = repository_rule(
-     implementation = _python_autoconf_impl,
-+    environ = _ENVIRONS + [TF_PYTHON_CONFIG_REPO],
-     attrs = {
-         "platform_constraint": attr.string(),
-     },
diff --git a/bazel/repositories.bzl b/bazel/repositories.bzl
index c032474c9..e33ea1526 100644
--- a/bazel/repositories.bzl
+++ b/bazel/repositories.bzl
@@ -16,91 +16,13 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
 
 def spu_deps():
-    _bazel_skylib()
-    _rules_cuda()
-    _rules_proto_grpc()
-    _bazel_platform()
     _com_github_xtensor_xtensor()
     _com_github_xtensor_xtl()
-    _com_github_openxla_xla()
-    _com_github_pybind11_bazel()
-    _com_github_pybind11()
-    _com_intel_hexl()
-    _com_github_emptoolkit_emp_tool()
-    _com_github_emptoolkit_emp_ot()
-    _com_github_facebook_zstd()
-    _com_github_eigenteam_eigen()
-    _com_github_nvidia_cutlass()
-    _yacl()
-    _libpsi()
-
-def _yacl():
-    maybe(
-        http_archive,
-        name = "yacl",
-        urls = [
-            "https://github.com/secretflow/yacl/archive/refs/tags/0.4.5b8_nightly_20241014.tar.gz",
-        ],
-        strip_prefix = "yacl-0.4.5b8_nightly_20241014",
-        sha256 = "9141792f07eba507ffd21c57ec3df2ad5fdf90ce605ffb7bc1b7b4e84a9c34fa",
-    )
-
-def _libpsi():
-    maybe(
-        http_archive,
-        name = "psi",
-        urls = [
-            "https://github.com/secretflow/psi/archive/refs/tags/v0.5.0.dev241115.tar.gz",
-        ],
-        strip_prefix = "psi-0.5.0.dev241115",
-        sha256 = "4d5ccc61282c4f887cee2c12fe3f414dfd7e916952849e92ffb1f6835d657a35",
-    )
-
-def _rules_proto_grpc():
-    http_archive(
-        name = "rules_proto_grpc",
-        sha256 = "2a0860a336ae836b54671cbbe0710eec17c64ef70c4c5a88ccfd47ea6e3739bd",
-        strip_prefix = "rules_proto_grpc-4.6.0",
-        urls = [
-            "https://github.com/rules-proto-grpc/rules_proto_grpc/releases/download/4.6.0/rules_proto_grpc-4.6.0.tar.gz",
-        ],
-    )
-
-def _rules_cuda():
-    http_archive(
-        name = "rules_cuda",
-        sha256 = "c92b334d769a07cd991b7675b2f6076b8b95cd3b28b14268a2f379f8baae58e0",
-        strip_prefix = "rules_cuda-v0.2.3",
-        urls = ["https://github.com/bazel-contrib/rules_cuda/releases/download/v0.2.3/rules_cuda-v0.2.3.tar.gz"],
-    )
-
-def _bazel_platform():
-    http_archive(
-        name = "platforms",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.10/platforms-0.0.10.tar.gz",
-            "https://github.com/bazelbuild/platforms/releases/download/0.0.10/platforms-0.0.10.tar.gz",
-        ],
-        sha256 = "218efe8ee736d26a3572663b374a253c012b716d8af0c07e842e82f238a0a7ee",
-    )
-
-def _com_github_facebook_zstd():
-    maybe(
-        http_archive,
-        name = "com_github_facebook_zstd",
-        build_file = "@spulib//bazel:zstd.BUILD",
-        strip_prefix = "zstd-1.5.6",
-        sha256 = "30f35f71c1203369dc979ecde0400ffea93c27391bfd2ac5a9715d2173d92ff7",
-        type = ".tar.gz",
-        urls = [
-            "https://github.com/facebook/zstd/archive/refs/tags/v1.5.6.tar.gz",
-        ],
-    )
 
 def _com_github_xtensor_xtensor():
     maybe(
         http_archive,
-        name = "com_github_xtensor_xtensor",
+        name = "xtensor",
         sha256 = "32d5d9fd23998c57e746c375a544edf544b74f0a18ad6bc3c38cbba968d5e6c7",
         strip_prefix = "xtensor-0.25.0",
         build_file = "@spulib//bazel:xtensor.BUILD",
@@ -113,7 +35,7 @@ def _com_github_xtensor_xtensor():
 def _com_github_xtensor_xtl():
     maybe(
         http_archive,
-        name = "com_github_xtensor_xtl",
+        name = "xtl",
         sha256 = "44fb99fbf5e56af5c43619fc8c29aa58e5fad18f3ba6e7d9c55c111b62df1fbb",
         strip_prefix = "xtl-0.7.7",
         build_file = "@spulib//bazel:xtl.BUILD",
@@ -122,130 +44,3 @@ def _com_github_xtensor_xtl():
             "https://github.com/xtensor-stack/xtl/archive/refs/tags/0.7.7.tar.gz",
         ],
     )
-
-def _bazel_skylib():
-    maybe(
-        http_archive,
-        name = "bazel_skylib",
-        sha256 = "bc283cdfcd526a52c3201279cda4bc298652efa898b10b4db0837dc51652756f",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
-            "https://github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
-        ],
-    )
-
-def _com_github_openxla_xla():
-    OPENXLA_COMMIT = "64bdcc53a1b24abf19b1fe598e6f9b0fe6454470"
-    OPENXLA_SHA256 = "60918b3a0391fe9e0bd506c9b90170b7b5fa64d06de7ec1f4f0e351a303a88fa"
-
-    # We need openxla to handle xla/mhlo/stablehlo
-    maybe(
-        http_archive,
-        name = "xla",
-        sha256 = OPENXLA_SHA256,
-        strip_prefix = "xla-" + OPENXLA_COMMIT,
-        type = ".tar.gz",
-        urls = [
-            "https://github.com/openxla/xla/archive/{commit}.tar.gz".format(commit = OPENXLA_COMMIT),
-        ],
-        patch_args = ["-p1", "-l"],
-        patches = ["@spulib//bazel:patches/xla-non-hermetic-python.patch"],
-    )
-
-def _com_github_pybind11_bazel():
-    maybe(
-        http_archive,
-        name = "pybind11_bazel",
-        sha256 = "dc4882b23a617575d0fd822aba88aa4a14133c3d428b5a8fb83d81d03444a475",
-        strip_prefix = "pybind11_bazel-8889d39b2b925b2a47519ae09402a96f00ccf2b4",
-        urls = [
-            "https://github.com/pybind/pybind11_bazel/archive/8889d39b2b925b2a47519ae09402a96f00ccf2b4.zip",
-        ],
-    )
-
-def _com_github_pybind11():
-    maybe(
-        http_archive,
-        name = "pybind11",
-        build_file = "@pybind11_bazel//:pybind11.BUILD",
-        sha256 = "e08cb87f4773da97fa7b5f035de8763abc656d87d5773e62f6da0587d1f0ec20",
-        strip_prefix = "pybind11-2.13.6",
-        urls = [
-            "https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.tar.gz",
-        ],
-    )
-
-def _com_intel_hexl():
-    maybe(
-        http_archive,
-        name = "com_intel_hexl",
-        type = "tar.gz",
-        strip_prefix = "hexl-1.2.5",
-        sha256 = "3692e6e6183dbc49253e51e86c3e52e7affcac925f57db0949dbb4d34b558a9a",
-        build_file = "@spulib//bazel:hexl.BUILD",
-        urls = [
-            "https://github.com/intel/hexl/archive/refs/tags/v1.2.5.tar.gz",
-        ],
-        patch_args = ["-p1"],
-        patches = ["@spulib//bazel:patches/hexl.patch"],
-    )
-
-def _com_github_emptoolkit_emp_tool():
-    maybe(
-        http_archive,
-        name = "com_github_emptoolkit_emp_tool",
-        sha256 = "b9ab2380312e78020346b5d2db3d0244c7bd8098cb50f8b3620532ef491808d0",
-        strip_prefix = "emp-tool-0.2.5",
-        type = "tar.gz",
-        patch_args = ["-p1"],
-        patches = [
-            "@spulib//bazel:patches/emp-tool.patch",
-            "@spulib//bazel:patches/emp-tool-cmake.patch",
-            "@spulib//bazel:patches/emp-tool-sse2neon.patch",
-        ],
-        urls = [
-            "https://github.com/emp-toolkit/emp-tool/archive/refs/tags/0.2.5.tar.gz",
-        ],
-        build_file = "@spulib//bazel:emp-tool.BUILD",
-    )
-
-def _com_github_emptoolkit_emp_ot():
-    maybe(
-        http_archive,
-        name = "com_github_emptoolkit_emp_ot",
-        sha256 = "358036e5d18143720ee17103f8172447de23014bcfc1f8e7d5849c525ca928ac",
-        strip_prefix = "emp-ot-0.2.4",
-        type = "tar.gz",
-        patch_args = ["-p1"],
-        patches = ["@spulib//bazel:patches/emp-ot.patch"],
-        urls = [
-            "https://github.com/emp-toolkit/emp-ot/archive/refs/tags/0.2.4.tar.gz",
-        ],
-        build_file = "@spulib//bazel:emp-ot.BUILD",
-    )
-
-def _com_github_eigenteam_eigen():
-    EIGEN_COMMIT = "66e8f38891841bf88ee976a316c0c78a52f0cee5"
-    EIGEN_SHA256 = "01fcd68409c038bbcfd16394274c2bf71e2bb6dda89a2319e23fc59a2da17210"
-    maybe(
-        http_archive,
-        name = "eigen_archive",
-        sha256 = EIGEN_SHA256,
-        build_file = "@spulib//bazel:eigen.BUILD",
-        strip_prefix = "eigen-{commit}".format(commit = EIGEN_COMMIT),
-        urls = [
-            "https://gitlab.com/libeigen/eigen/-/archive/{commit}/eigen-{commit}.tar.gz".format(commit = EIGEN_COMMIT),
-        ],
-    )
-
-def _com_github_nvidia_cutlass():
-    maybe(
-        http_archive,
-        name = "cutlass_archive",
-        strip_prefix = "cutlass-3.6.0",
-        urls = [
-            "https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.6.0.tar.gz",
-        ],
-        sha256 = "7576f3437b90d0de5923560ccecebaa1357e5d72f36c0a59ad77c959c9790010",
-        build_file = "@spulib//bazel:nvidia_cutlass.BUILD",
-    )
diff --git a/bazel/seal.BUILD b/bazel/seal.BUILD
deleted file mode 100644
index 75b22136a..000000000
--- a/bazel/seal.BUILD
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all",
-    srcs = glob(["**"]),
-)
-
-config_setting(
-    name = "can_use_hexl",
-    constraint_values = [
-        "@platforms//cpu:x86_64",
-    ],
-    values = {"compilation_mode": "opt"},
-)
-
-default_config = {
-    "SEAL_USE_MSGSL": "OFF",
-    "SEAL_BUILD_DEPS": "OFF",
-    "SEAL_USE_ZLIB": "OFF",
-    "SEAL_USE_INTEL_HEXL": "OFF",
-    "SEAL_THROW_ON_TRANSPARENT_CIPHERTEXT": "OFF",  #NOTE(juhou) required by apsi
-    "SEAL_USE_ZSTD": "ON",
-    "CMAKE_INSTALL_LIBDIR": "lib",
-}
-
-x64_hexl_config = {
-    "SEAL_USE_MSGSL": "OFF",
-    "SEAL_BUILD_DEPS": "OFF",
-    "SEAL_USE_ZLIB": "OFF",
-    "SEAL_THROW_ON_TRANSPARENT_CIPHERTEXT": "OFF",  #NOTE(juhou) required by apsi
-    "CMAKE_INSTALL_LIBDIR": "lib",
-    "CpuFeatures_DIR": "$EXT_BUILD_DEPS/cpu_features/lib/cmake/CpuFeatures/",
-    "EXT_BUILD_DEPS": "$EXT_BUILD_DEPS",
-    "SEAL_USE_ZSTD": "ON",
-    "SEAL_USE_INTEL_HEXL": "ON",
-}
-
-spu_cmake_external(
-    name = "seal",
-    cache_entries = select({
-        ":can_use_hexl": x64_hexl_config,
-        "//conditions:default": default_config,
-    }),
-    lib_source = "@com_github_microsoft_seal//:all",
-    out_include_dir = "include/SEAL-4.1",
-    out_static_libs = ["libseal-4.1.a"],
-    deps = [
-        "@com_github_facebook_zstd//:zstd",
-    ] + select({
-        "@platforms//cpu:x86_64": ["@com_intel_hexl//:hexl"],
-        "//conditions:default": [],
-    }),
-)
diff --git a/bazel/spu.bzl b/bazel/spu.bzl
index 0ac3fb45c..f0bff2435 100644
--- a/bazel/spu.bzl
+++ b/bazel/spu.bzl
@@ -17,6 +17,8 @@ warpper bazel cc_xx to modify flags.
 """
 
 load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
+load("@rules_python//python:defs.bzl", "py_binary", "py_library", "py_test")
+load("@spu_pip//:requirements.bzl", pip_dep = "all_requirements")
 load("@yacl//bazel:yacl.bzl", "yacl_cmake_external")
 
 WARNING_FLAGS = [
@@ -59,7 +61,7 @@ def spu_cc_library(
         linkopts = linkopts,
         copts = _spu_copts() + copts,
         deps = deps + [
-            "@com_github_gabime_spdlog//:spdlog",
+            "@spdlog//:spdlog",
         ],
         local_defines = local_defines + [
             "SPU_BUILD",
@@ -96,12 +98,36 @@ def spu_cc_test(
         # -lm for tcmalloc
         linkopts = linkopts + ["-lm"],
         copts = _spu_copts() + copts,
-        deps = deps + [
-            "@com_google_googletest//:gtest_main",
-        ],
+        deps = [
+            "@googletest//:gtest_main",
+        ] + deps,
         local_defines = local_defines + [
             "SPU_BUILD",
         ],
         linkstatic = True,
         **kwargs
     )
+
+def spu_py_binary(
+        deps = [],
+        **kwargs):
+    py_binary(
+        deps = deps + pip_dep,
+        **kwargs
+    )
+
+def spu_py_library(
+        deps = [],
+        **kwargs):
+    py_library(
+        deps = deps + pip_dep,
+        **kwargs
+    )
+
+def spu_py_test(
+        deps = [],
+        **kwargs):
+    py_test(
+        deps = deps + pip_dep,
+        **kwargs
+    )
diff --git a/bazel/xtensor.BUILD b/bazel/xtensor.BUILD
index 7789669d2..f4bf88a25 100644
--- a/bazel/xtensor.BUILD
+++ b/bazel/xtensor.BUILD
@@ -1,4 +1,4 @@
-# Copyright 2022 Ant Group Co., Ltd.
+# Copyright 2024 Ant Group Co., Ltd.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,18 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
 
 filegroup(
     name = "all_srcs",
     srcs = glob(["**"]),
 )
 
-spu_cmake_external(
+cmake(
     name = "xtensor",
+    generate_args = ["-GNinja"],
     lib_source = ":all_srcs",
     out_headers_only = True,
-    deps = ["@com_github_xtensor_xtl//:xtl"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@xtl",
+    ],
 )
diff --git a/bazel/zstd.BUILD b/bazel/zstd.BUILD
deleted file mode 100644
index 9a9a09242..000000000
--- a/bazel/zstd.BUILD
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2022 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-load("@spulib//bazel:spu.bzl", "spu_cmake_external")
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "all",
-    srcs = glob(["**"]),
-)
-
-spu_cmake_external(
-    name = "zstd",
-    cache_entries = {
-        "ZSTD_BUILD_PROGRAMS": "OFF",
-        "ZSTD_BUILD_SHARED": "OFF",
-        "ZLIB_BUILD_STATIC": "ON",
-        "ZSTD_BUILD_TESTS": "OFF",
-        "ZSTD_MULTITHREAD_SUPPORT": "OFF",
-        "CMAKE_INSTALL_LIBDIR": "lib",
-    },
-    lib_source = "@com_github_facebook_zstd//:all",
-    out_include_dir = "include/",
-    out_static_libs = ["libzstd.a"],
-    working_directory = "build/cmake",
-)
diff --git a/benchmark/setup_dockers_and_run.sh b/benchmark/setup_dockers_and_run.sh
index e1c8c45e2..bfb9a2f5f 100644
--- a/benchmark/setup_dockers_and_run.sh
+++ b/benchmark/setup_dockers_and_run.sh
@@ -31,7 +31,6 @@ echo -e "${COLOR_GREEN}Build spu-build${COLOR_END}"
 docker run --name spu-build --mount type=bind,source="$(pwd)",target=/home/admin/dev/ secretflow/ubuntu-base-ci:latest \
     sh -c "cd /home/admin/dev && \
             python3 -m pip install -U pip && \
-            python3 -m pip install -r requirements.txt && \
             bazel build //benchmark/... //examples/python/... -c opt --ui_event_filters=-info,-debug,-warning"
 
 docker commit spu-build spu-build:v1
diff --git a/build_wheel_entrypoint.sh b/build_wheel_entrypoint.sh
index 17d70cf44..d133a782b 100755
--- a/build_wheel_entrypoint.sh
+++ b/build_wheel_entrypoint.sh
@@ -15,9 +15,9 @@
 # limitations under the License.
 #
 
-pip install numpy
 
-python setup.py bdist_wheel
+# FIXME: add build option `--config=avx` if building on x86_64 platform
+bazel build //:spu_wheel -c opt
 
 # Ensure binary safety
 if [[ "$OSTYPE" == "linux-gnu"* ]]; then
diff --git a/examples/cpp/BUILD.bazel b/examples/cpp/BUILD.bazel
index b2d27cc9d..8431250e3 100644
--- a/examples/cpp/BUILD.bazel
+++ b/examples/cpp/BUILD.bazel
@@ -31,7 +31,7 @@ spu_cc_binary(
         "//libspu/kernel/hlo:casting",
         "//libspu/kernel/hlo:const",
         "//libspu/kernel/hlo:geometrical",
-        "@com_google_absl//absl/strings",
+        "@abseil-cpp//absl/strings",
         "@llvm-project//llvm:Support",
         "@yacl//yacl/link:factory",
     ],
@@ -43,7 +43,7 @@ spu_cc_library(
     hdrs = ["utils.h"],
     deps = [
         "//libspu/core:context",
-        "@com_google_absl//absl/strings",
+        "@abseil-cpp//absl/strings",
         "@llvm-project//llvm:Support",
         "@yacl//yacl/link:factory",
     ],
diff --git a/examples/python/ml/BUILD.bazel b/examples/python/ml/BUILD.bazel
index 9fcf1b3e7..c06da27d1 100644
--- a/examples/python/ml/BUILD.bazel
+++ b/examples/python/ml/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_test")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -43,5 +44,5 @@ py_test(
         "//examples/python/ml/torch_lr_experiment",
         "//examples/python/ml/torch_resnet_experiment",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_gpt2/BUILD.bazel b/examples/python/ml/flax_gpt2/BUILD.bazel
index b965e871d..168ea6b3f 100644
--- a/examples/python/ml/flax_gpt2/BUILD.bazel
+++ b/examples/python/ml/flax_gpt2/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_gpt2/README.md b/examples/python/ml/flax_gpt2/README.md
index 7eb25fc4d..652b17b30 100644
--- a/examples/python/ml/flax_gpt2/README.md
+++ b/examples/python/ml/flax_gpt2/README.md
@@ -3,19 +3,13 @@
 This example demonstrates how to use SPU to run private inference on a pre-trained
 [GPT2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) model.
 
-1. Install huggingface transformers library
-
-    ```sh
-    pip install 'transformers[flax]'
-    ```
-
-2. Launch SPU backend runtime
+1. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- --config `pwd`/examples/python/ml/flax_gpt2/3pc.json up
     ```
 
-3. Run `flax_gpt2` example
+2. Run `flax_gpt2` example
 
     ```sh
     bazel run -c opt //examples/python/ml/flax_gpt2 -- --config `pwd`/examples/python/ml/flax_gpt2/3pc.json
diff --git a/examples/python/ml/flax_mlp/BUILD.bazel b/examples/python/ml/flax_mlp/BUILD.bazel
index 56274bd2b..7093ad69c 100644
--- a/examples/python/ml/flax_mlp/BUILD.bazel
+++ b/examples/python/ml/flax_mlp/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -25,5 +26,5 @@ py_binary(
     deps = [
         "//examples/python/utils:dataset_utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_resnet/BUILD.bazel b/examples/python/ml/flax_resnet/BUILD.bazel
index 0f6cfa6e8..70105069b 100644
--- a/examples/python/ml/flax_resnet/BUILD.bazel
+++ b/examples/python/ml/flax_resnet/BUILD.bazel
@@ -12,28 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_binary", "py_library")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
+load("//bazel:spu.bzl", "spu_py_binary")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
-    name = "models",
-    srcs = ["models.py"],
-)
-
-py_binary(
+spu_py_binary(
     name = "flax_resnet_training",
-    srcs = ["flax_resnet_training.py"],
+    srcs = [
+        "flax_resnet_training.py",
+        "models.py",
+    ],
     data = [
         "//examples/python/ml/flax_resnet:3pc.json",
     ],
     deps = [
-        ":models",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
 
-py_binary(
+spu_py_binary(
     name = "flax_resnet_inference",
     srcs = ["flax_resnet_inference.py"],
     data = [
@@ -41,5 +39,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_resnet/flax_resnet_training.py b/examples/python/ml/flax_resnet/flax_resnet_training.py
index 2946ade5d..df0fcd846 100644
--- a/examples/python/ml/flax_resnet/flax_resnet_training.py
+++ b/examples/python/ml/flax_resnet/flax_resnet_training.py
@@ -28,7 +28,7 @@
 import tensorflow_datasets as tfds
 from flax.training import train_state
 from jax import random
-from models import ResNet18
+from examples.python.ml.flax_resnet.models import ResNet18
 
 NUM_CLASSES = 10
 IMAGE_SIZE = 32
diff --git a/examples/python/ml/flax_t5/BUILD.bazel b/examples/python/ml/flax_t5/BUILD.bazel
index 2855a08a4..8b842c69f 100644
--- a/examples/python/ml/flax_t5/BUILD.bazel
+++ b/examples/python/ml/flax_t5/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_t5/README.md b/examples/python/ml/flax_t5/README.md
index 8399b787d..b20292512 100644
--- a/examples/python/ml/flax_t5/README.md
+++ b/examples/python/ml/flax_t5/README.md
@@ -3,23 +3,17 @@
 This example demonstrates how to use SPU to run private inference on a pre-trained
 [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.FlaxT5ForConditionalGeneration) model.
 
-1. Install huggingface transformers library
-
-    ```sh
-    pip install 'transformers[flax]'
-    ```
-
-2. Enable While with secret value
+1. Enable While with secret value
 
     Edit libspu/kernel/hlo/control_flow.cc, change `ENABLE_DEBUG_ONLY_REVEAL_SECRET_CONDITION` to `true`.
 
-3. Launch SPU backend runtime
+2. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- --config `pwd`/examples/python/ml/flax_t5/3pc.json up
     ```
 
-4. Run `flax_t5` example
+3. Run `flax_t5` example
 
     ```sh
     bazel run -c opt //examples/python/ml/flax_t5 -- --config `pwd`/examples/python/ml/flax_t5/3pc.json
diff --git a/examples/python/ml/flax_vae/BUILD.bazel b/examples/python/ml/flax_vae/BUILD.bazel
index 5585fd246..d269c0416 100644
--- a/examples/python/ml/flax_vae/BUILD.bazel
+++ b/examples/python/ml/flax_vae/BUILD.bazel
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_binary", "py_library")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
+load("//bazel:spu.bzl", "spu_py_binary", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "utils",
     srcs = ["utils.py"],
 )
 
-py_binary(
+spu_py_binary(
     name = "flax_vae",
     srcs = ["flax_vae.py"],
     data = [
@@ -30,5 +31,5 @@ py_binary(
     deps = [
         ":utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_whisper/BUILD.bazel b/examples/python/ml/flax_whisper/BUILD.bazel
index 30bc8d3c0..4c97b86b7 100644
--- a/examples/python/ml/flax_whisper/BUILD.bazel
+++ b/examples/python/ml/flax_whisper/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/flax_whisper/README.md b/examples/python/ml/flax_whisper/README.md
index 1a331f4e7..99fb07c78 100644
--- a/examples/python/ml/flax_whisper/README.md
+++ b/examples/python/ml/flax_whisper/README.md
@@ -3,23 +3,17 @@
 This example demonstrates how to use SPU to run private inference on a pre-trained
 [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.FlaxWhisperForConditionalGeneration) model.
 
-1. Install huggingface transformers library
-
-    ```sh
-    pip install 'transformers[flax]' soundfile librosa
-    ```
-
-2. Enable While with secret value
+1. Enable While with secret value
 
     Edit libspu/kernel/hlo/control_flow.cc, change `ENABLE_DEBUG_ONLY_REVEAL_SECRET_CONDITION` to `true`.
 
-3. Launch SPU backend runtime
+2. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- --config `pwd`/examples/python/ml/flax_whisper/3pc.json up
     ```
 
-4. Run `flax_whisper` example
+3. Run `flax_whisper` example
 
     ```sh
     bazel run -c opt //examples/python/ml/flax_whisper -- --config `pwd`/examples/python/ml/flax_whisper/3pc.json
diff --git a/examples/python/ml/haiku_lstm/BUILD.bazel b/examples/python/ml/haiku_lstm/BUILD.bazel
index fc7a922bb..d6591e01f 100644
--- a/examples/python/ml/haiku_lstm/BUILD.bazel
+++ b/examples/python/ml/haiku_lstm/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/haiku_lstm/README.md b/examples/python/ml/haiku_lstm/README.md
index 97f0990c1..e04cfe6e4 100644
--- a/examples/python/ml/haiku_lstm/README.md
+++ b/examples/python/ml/haiku_lstm/README.md
@@ -6,23 +6,17 @@ This example comes from Haiku official github repo:
 
 <https://github.com/deepmind/dm-haiku/blob/main/examples/haiku_lstms.ipynb>
 
-1. Install dependencies
-
-    ```sh
-    pip install -r ../requirements.txt
-    ```
-
-2. Launch SPU backend runtime
+1. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- up
     ```
 
-3. Run `haiku_lstm` example
+2. Run `haiku_lstm` example
 
     ```sh
     bazel run -c opt //examples/python/ml/haiku_lstm -- --output_dir `pwd`
     ```
 
-4. Check results
+3. Check results
     When training is finished, you can check the generated images in the specified `output_dir` and compare the results to CPU versions.
diff --git a/examples/python/ml/jax_kmeans/BUILD.bazel b/examples/python/ml/jax_kmeans/BUILD.bazel
index d59803c12..0c348b9a0 100644
--- a/examples/python/ml/jax_kmeans/BUILD.bazel
+++ b/examples/python/ml/jax_kmeans/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/jax_lr/BUILD.bazel b/examples/python/ml/jax_lr/BUILD.bazel
index 5c880c45d..591e209c7 100644
--- a/examples/python/ml/jax_lr/BUILD.bazel
+++ b/examples/python/ml/jax_lr/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -26,5 +27,5 @@ py_binary(
         "//examples/python/utils:dataset_utils",
         "//spu:init",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/jax_svm/BUILD.bazel b/examples/python/ml/jax_svm/BUILD.bazel
index 9fdaba473..4bfca8ace 100644
--- a/examples/python/ml/jax_svm/BUILD.bazel
+++ b/examples/python/ml/jax_svm/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -25,5 +26,5 @@ py_binary(
     deps = [
         "//examples/python/utils:dataset_utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/jraph_gnn/BUILD.bazel b/examples/python/ml/jraph_gnn/BUILD.bazel
index 6db9601ed..dc155a183 100644
--- a/examples/python/ml/jraph_gnn/BUILD.bazel
+++ b/examples/python/ml/jraph_gnn/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/jraph_gnn/README.md b/examples/python/ml/jraph_gnn/README.md
index 328838880..cc074372f 100644
--- a/examples/python/ml/jraph_gnn/README.md
+++ b/examples/python/ml/jraph_gnn/README.md
@@ -6,13 +6,7 @@ This example comes from Jraph official github repo:
 
 <https://github.com/deepmind/jraph/blob/master/jraph/examples/zacharys_karate_club.py>
 
-1. Install dependencies
-
-    ```sh
-    pip install -r ../requirements.txt
-    ```
-
-2. Set runtime configuration
+1. Set runtime configuration
 
     This example requires a higher precision setting than the default.
 
@@ -20,13 +14,13 @@ This example comes from Jraph official github repo:
 
     The default configuration file locates at [examples/python/conf/3pc.json](../../conf/3pc.json).
 
-3. Launch SPU backend runtime
+2. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- up
     ```
 
-4. Run `jraph_gnn` example
+3. Run `jraph_gnn` example
 
     ```sh
     bazel run -c opt //examples/python/ml/jraph_gnn
diff --git a/examples/python/ml/requirements.txt b/examples/python/ml/requirements.txt
deleted file mode 100644
index 60befe345..000000000
--- a/examples/python/ml/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-dm-haiku
-plotnine
-jraph
-optax
-torch==2.3.0
-torch_xla==2.3.0
-torchvision
-jax[cpu]
-tensorflow_datasets
-keras
diff --git a/examples/python/ml/ss_lr/BUILD.bazel b/examples/python/ml/ss_lr/BUILD.bazel
index caa50155b..413710f13 100644
--- a/examples/python/ml/ss_lr/BUILD.bazel
+++ b/examples/python/ml/ss_lr/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -26,5 +27,5 @@ py_binary(
         "//examples/python/utils:appr_sigmoid",
         "//examples/python/utils:dataset_utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/ss_xgb/BUILD.bazel b/examples/python/ml/ss_xgb/BUILD.bazel
index 34168304b..3e9c0f8df 100644
--- a/examples/python/ml/ss_xgb/BUILD.bazel
+++ b/examples/python/ml/ss_xgb/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -26,5 +27,5 @@ py_binary(
         "//examples/python/utils:appr_sigmoid",
         "//examples/python/utils:dataset_utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/stax_mnist_classifier/BUILD.bazel b/examples/python/ml/stax_mnist_classifier/BUILD.bazel
index 2c7f6da86..e0e5d5bc1 100644
--- a/examples/python/ml/stax_mnist_classifier/BUILD.bazel
+++ b/examples/python/ml/stax_mnist_classifier/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -25,5 +26,5 @@ py_binary(
     deps = [
         "//examples/python/utils:dataset_utils",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/stax_nn/BUILD.bazel b/examples/python/ml/stax_nn/BUILD.bazel
index 2318515fd..0a7484b0b 100644
--- a/examples/python/ml/stax_nn/BUILD.bazel
+++ b/examples/python/ml/stax_nn/BUILD.bazel
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
+load("//bazel:spu.bzl", "spu_py_binary", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "models",
     srcs = ["models.py"],
 )
 
-py_binary(
+spu_py_binary(
     name = "stax_nn",
     srcs = ["stax_nn.py"],
     data = [
@@ -32,5 +33,5 @@ py_binary(
         "//examples/python/utils:dataset_utils",
         "//examples/python/utils:optimizers",
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/tf_experiment/BUILD.bazel b/examples/python/ml/tf_experiment/BUILD.bazel
index 8bcce8c09..cfe836b6f 100644
--- a/examples/python/ml/tf_experiment/BUILD.bazel
+++ b/examples/python/ml/tf_experiment/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/torch_lr_experiment/BUILD.bazel b/examples/python/ml/torch_lr_experiment/BUILD.bazel
index 36cdcce16..f85eb654e 100644
--- a/examples/python/ml/torch_lr_experiment/BUILD.bazel
+++ b/examples/python/ml/torch_lr_experiment/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/torch_lr_experiment/README.md b/examples/python/ml/torch_lr_experiment/README.md
index 57754ed97..04d5f5d1f 100644
--- a/examples/python/ml/torch_lr_experiment/README.md
+++ b/examples/python/ml/torch_lr_experiment/README.md
@@ -4,19 +4,13 @@ This example demonstrates how to use SPU to make private inferences on PyTorch m
 
 **Note**: Currently, SPU's support of PyTorch is **experimental**.
 
-1. Install a third-party dependency [PyTorch/XLA](https://github.com/pytorch/xla).
-
-    ```sh
-    pip install torch==2.3.0 torch_xla==2.3.0
-    ```
-
-2. Launch SPU backend runtime
+1. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- up
     ```
 
-3. Run `torch_lr_experiment` example
+2. Run `torch_lr_experiment` example
 
     ```sh
     bazel run -c opt //examples/python/ml/torch_lr_experiment
diff --git a/examples/python/ml/torch_resnet_experiment/BUILD.bazel b/examples/python/ml/torch_resnet_experiment/BUILD.bazel
index 91d89e456..87a6033e1 100644
--- a/examples/python/ml/torch_resnet_experiment/BUILD.bazel
+++ b/examples/python/ml/torch_resnet_experiment/BUILD.bazel
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 load("@rules_python//python:defs.bzl", "py_binary")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -24,5 +25,5 @@ py_binary(
     ],
     deps = [
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/examples/python/ml/torch_resnet_experiment/README.md b/examples/python/ml/torch_resnet_experiment/README.md
index af2a04aa5..df902d047 100644
--- a/examples/python/ml/torch_resnet_experiment/README.md
+++ b/examples/python/ml/torch_resnet_experiment/README.md
@@ -4,20 +4,13 @@ This example demonstrates how to use SPU to make private inferences on PyTorch m
 
 **Note**: Currently, SPU's support of PyTorch is **experimental**.
 
-1. Install a third-party dependency [PyTorch/XLA](https://github.com/pytorch/xla).
-
-    ```sh
-    pip install torch==2.3.0 torch_xla==2.3.0
-    pip install torchvision
-    ```
-
-2. Launch SPU backend runtime
+1. Launch SPU backend runtime
 
     ```sh
     bazel run -c opt //examples/python/utils:nodectl -- up
     ```
 
-3. Run `torch_resnet_experiment` example
+2. Run `torch_resnet_experiment` example
 
     ```sh
     bazel run -c opt //examples/python/ml/torch_resnet_experiment
diff --git a/examples/python/utils/BUILD.bazel b/examples/python/utils/BUILD.bazel
index 9e5fa2d58..aea58934d 100644
--- a/examples/python/utils/BUILD.bazel
+++ b/examples/python/utils/BUILD.bazel
@@ -12,33 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("@spu_pip_dev//:requirements.bzl", "all_requirements")
+load("//bazel:spu.bzl", "spu_py_binary", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "appr_sigmoid",
     srcs = ["appr_sigmoid.py"],
 )
 
-py_library(
+spu_py_library(
     name = "dataset_utils",
     srcs = ["dataset_utils.py"],
 )
 
-py_library(
+spu_py_library(
     name = "stax_utils",
     srcs = ["stax_utils.py"],
     deps = [
     ],
 )
 
-py_library(
+spu_py_library(
     name = "optimizers",
     srcs = ["optimizers.py"],
 )
 
-py_binary(
+spu_py_binary(
     name = "nodectl",
     srcs = ["nodectl.py"],
     data = [
@@ -47,5 +48,5 @@ py_binary(
     deps = [
         ":dataset_utils",  # server dependent.
         "//spu/utils:distributed",
-    ],
+    ] + all_requirements,
 )
diff --git a/experimental/squirrel/BUILD.bazel b/experimental/squirrel/BUILD.bazel
index 64c794a5c..518fc4ef9 100644
--- a/experimental/squirrel/BUILD.bazel
+++ b/experimental/squirrel/BUILD.bazel
@@ -60,8 +60,8 @@ spu_cc_library(
         "//libspu/mpc/cheetah/rlwe:cheetah_rlwe",
         "//libspu/mpc/cheetah/rlwe:lwe",
         "//libspu/mpc/cheetah/rlwe:packlwes",
-        "@com_github_microsoft_seal//:seal",
-        "@eigen_archive//:eigen3",
+        "@eigen",
+        "@seal",
         "@yacl//yacl/utils:elapsed_timer",
     ],
 )
@@ -139,7 +139,7 @@ spu_cc_binary(
         ":tree_build_worker",
         ":tree_builder",
         "//libspu/device:io",
-        "@com_google_absl//absl/strings",
+        "@abseil-cpp//absl/strings",
         "@llvm-project//llvm:Support",
         "@yacl//yacl/link:factory",
     ],
diff --git a/libspu/BUILD.bazel b/libspu/BUILD.bazel
index 2b0fcf3f8..8d413c454 100644
--- a/libspu/BUILD.bazel
+++ b/libspu/BUILD.bazel
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+load("@protobuf//bazel:py_proto_library.bzl", "py_proto_library")
 load("@rules_cc//cc:defs.bzl", "cc_proto_library")
 load("@rules_proto//proto:defs.bzl", "proto_library")
-load("@rules_proto_grpc//python:defs.bzl", "python_proto_compile")
+load("//:version.bzl", "spu_version_gen")
 load("//bazel:spu.bzl", "spu_cc_library")
 
 package(default_visibility = ["//visibility:public"])
@@ -29,14 +30,18 @@ cc_proto_library(
     deps = [":spu_proto"],
 )
 
-python_proto_compile(
+py_proto_library(
     name = "spu_py_proto",
-    output_mode = "NO_PREFIX",
-    prefix_path = "..",
-    protos = ["//libspu:spu_proto"],
+    deps = ["//libspu:spu_proto"],
 )
 
 spu_cc_library(
     name = "version",
-    hdrs = ["version.h"],
+    hdrs = [":spu_version"],
+)
+
+spu_version_gen(
+    name = "spu_version",
+    out = "version.h",
+    template = "version.h.in",
 )
diff --git a/libspu/compiler/common/BUILD.bazel b/libspu/compiler/common/BUILD.bazel
index eef7efaf2..8d64134c3 100644
--- a/libspu/compiler/common/BUILD.bazel
+++ b/libspu/compiler/common/BUILD.bazel
@@ -22,7 +22,7 @@ spu_cc_library(
     hdrs = ["ir_printer_config.h"],
     visibility = ["//visibility:private"],
     deps = [
-        "@com_github_fmtlib_fmt//:fmtlib",
+        "@fmt",
         "@llvm-project//mlir:Pass",
     ],
 )
diff --git a/libspu/compiler/tests/BUILD.bazel b/libspu/compiler/tests/BUILD.bazel
index 2dd061272..28c3224e2 100644
--- a/libspu/compiler/tests/BUILD.bazel
+++ b/libspu/compiler/tests/BUILD.bazel
@@ -23,8 +23,8 @@ expand_template(
     substitutions = {
         "@LIT_SITE_CFG_IN_HEADER@": "# Autogenerated, do not edit.",
         "@LLVM_TOOLS_DIR@": package_path("@llvm-project//llvm:BUILD"),
-        "\"@PPHLO_TOOLS_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'spulib', 'libspu', 'compiler', 'tools')",
-        "\"@PPHLO_SOURCE_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], 'spulib')",
+        "\"@PPHLO_TOOLS_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], '_main', 'libspu', 'compiler', 'tools')",
+        "\"@PPHLO_SOURCE_DIR@\"": "os.path.join(os.environ['TEST_SRCDIR'], '_main')",
     },
     template = "lit.site.cfg.py.in",
 )
diff --git a/libspu/compiler/tests/passes/optimizations/sort_lowering.mlir b/libspu/compiler/tests/passes/optimizations/sort_lowering.mlir
index 37d98fd1b..ef9e8b790 100644
--- a/libspu/compiler/tests/passes/optimizations/sort_lowering.mlir
+++ b/libspu/compiler/tests/passes/optimizations/sort_lowering.mlir
@@ -21,3 +21,19 @@ func.func @main(%arg0: tensor<10x!pphlo.secret<f32>>, %arg1: tensor<10x!pphlo.se
     }) {dimension = 0 : i64, is_stable = false} : (tensor<10x!pphlo.secret<f32>>, tensor<10x!pphlo.secret<f32>>) -> (tensor<10x!pphlo.secret<f32>>, tensor<10x!pphlo.secret<f32>>)
     return %0#0, %0#1 : tensor<10x!pphlo.secret<f32>>, tensor<10x!pphlo.secret<f32>>
   }
+
+// -----
+
+func.func @main(%arg0: tensor<3x4x!pphlo.secret<f32>>, %arg1: tensor<3x4x!pphlo.secret<f32>>, %arg2: tensor<3x4x!pphlo.secret<f32>>) -> (tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>) {
+    // CHECK: %0:3 = pphlo.simple_sort %arg0, %arg1, %arg2  ASC, dim = 1, num_keys = 2 : (tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>) -> (tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>)
+    %0:3 = "pphlo.sort"(%arg0, %arg1, %arg2) ({
+    ^bb0(%arg3: tensor<!pphlo.secret<f32>>, %arg4: tensor<!pphlo.secret<f32>>, %arg5: tensor<!pphlo.secret<f32>>, %arg6: tensor<!pphlo.secret<f32>>, %arg7: tensor<!pphlo.secret<f32>>, %arg8: tensor<!pphlo.secret<f32>>):
+      %1 = pphlo.less %arg3, %arg4 : (tensor<!pphlo.secret<f32>>, tensor<!pphlo.secret<f32>>) -> tensor<!pphlo.secret<i1>>
+      %2 = pphlo.equal %arg3, %arg4 : (tensor<!pphlo.secret<f32>>, tensor<!pphlo.secret<f32>>) -> tensor<!pphlo.secret<i1>>
+      %3 = pphlo.less %arg5, %arg6 : (tensor<!pphlo.secret<f32>>, tensor<!pphlo.secret<f32>>) -> tensor<!pphlo.secret<i1>>
+      %4 = pphlo.and %2, %3 : tensor<!pphlo.secret<i1>>
+      %5 = pphlo.or %1, %4 : tensor<!pphlo.secret<i1>>
+      pphlo.return %5 : tensor<!pphlo.secret<i1>>
+    }) {dimension = 1 : i64, is_stable = true} : (tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>) -> (tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>)
+    return %0#0, %0#1, %0#2 : tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>, tensor<3x4x!pphlo.secret<f32>>
+  }
diff --git a/libspu/core/BUILD.bazel b/libspu/core/BUILD.bazel
index 1f4278493..b5c95acfd 100644
--- a/libspu/core/BUILD.bazel
+++ b/libspu/core/BUILD.bazel
@@ -175,7 +175,7 @@ spu_cc_library(
         ":ndarray_ref",
         ":pt_buffer_view",
         ":shape",
-        "@com_github_xtensor_xtensor//:xtensor",
+        "@xtensor",
     ],
 )
 
@@ -218,9 +218,9 @@ spu_cc_library(
     hdrs = ["logging.h"],
     deps = [
         "//libspu/core:prelude",
-        "@com_github_brpc_brpc//:butil",
-        "@com_github_fmtlib_fmt//:fmtlib",
-        "@com_google_absl//absl/strings",
+        "@abseil-cpp//absl/strings",
+        "@brpc//:butil",
+        "@fmt",
         "@yacl//yacl/link:trace",
     ],
 )
@@ -230,7 +230,7 @@ spu_cc_library(
     srcs = ["bit_utils.cc"],
     hdrs = ["bit_utils.h"],
     deps = [
-        "@com_google_absl//absl/numeric:bits",
+        "@abseil-cpp//absl/numeric:bits",
         "@yacl//yacl/base:int128",
         "@yacl//yacl/utils:platform_utils",
     ],
@@ -251,7 +251,7 @@ spu_cc_binary(
     linkopts = ["-lm"],
     deps = [
         ":bit_utils",
-        "@com_github_google_benchmark//:benchmark_main",
+        "@google_benchmark//:benchmark_main",
     ],
 )
 
diff --git a/libspu/core/config.cc b/libspu/core/config.cc
index 81ca2f9d4..c40c4c3e1 100644
--- a/libspu/core/config.cc
+++ b/libspu/core/config.cc
@@ -57,6 +57,11 @@ void populateRuntimeConfig(RuntimeConfig& cfg) {
     cfg.set_fxp_div_goldschmidt_iters(2);
   }
 
+  // sort
+  if (cfg.quick_sort_threshold() == 0) {
+    cfg.set_quick_sort_threshold(32);
+  }
+
   // fxp exponent config
   {
     if (cfg.fxp_exp_mode() == RuntimeConfig::EXP_DEFAULT) {
diff --git a/libspu/core/encoding.cc b/libspu/core/encoding.cc
index 98a17a1ac..a92c181d0 100644
--- a/libspu/core/encoding.cc
+++ b/libspu/core/encoding.cc
@@ -127,13 +127,14 @@ NdArrayRef encodeToRing(const PtBufferView& bv, FieldType field,
 void decodeFromRing(const NdArrayRef& src, DataType in_dtype, size_t fxp_bits,
                     PtBufferView* out_bv, PtType* out_pt_type) {
   const Type& src_type = src.eltype();
+
+  SPU_ENFORCE(src_type.isa<Ring2k>(), "source must be ring2k, got={}",
+              src_type);
+
   const FieldType field = src_type.as<Ring2k>()->field();
   const PtType pt_type = getDecodeType(in_dtype);
   const size_t numel = src.numel();
 
-  SPU_ENFORCE(src_type.isa<RingTy>(), "source must be ring_type, got={}",
-              src_type);
-
   if (out_pt_type != nullptr) {
     *out_pt_type = pt_type;
   }
diff --git a/libspu/core/prelude.h b/libspu/core/prelude.h
index d9726cbed..cfb6ce1ce 100644
--- a/libspu/core/prelude.h
+++ b/libspu/core/prelude.h
@@ -92,4 +92,7 @@ struct formatter<spu::RuntimeConfig_SigmoidMode> : ostream_formatter {};
 template <>
 struct formatter<spu::SourceIRType> : ostream_formatter {};
 
+template <>
+struct formatter<spu::RuntimeConfig_SortMethod> : ostream_formatter {};
+
 }  // namespace fmt
diff --git a/libspu/core/value.h b/libspu/core/value.h
index 75c10f78d..1eccd384d 100644
--- a/libspu/core/value.h
+++ b/libspu/core/value.h
@@ -67,6 +67,13 @@ class Value final {
   bool isPublic() const { return vtype() == VIS_PUBLIC; }
   bool isSecret() const { return vtype() == VIS_SECRET; }
   bool isPrivate() const { return vtype() == VIS_PRIVATE; }
+  // Owner taken from the Private storage type; -1 when not private.
+  int64_t owner() const {
+    if (!isPrivate()) {
+      return -1;
+    }
+    return storage_type().as<Private>()->owner();
+  }
 
   // Get dtype.
   DataType dtype() const { return dtype_; }
diff --git a/libspu/cuda_support/BUILD.bazel b/libspu/cuda_support/BUILD.bazel
index f85ccc36c..eb8430943 100644
--- a/libspu/cuda_support/BUILD.bazel
+++ b/libspu/cuda_support/BUILD.bazel
@@ -25,7 +25,7 @@ cuda_library(
         "manual",  # Exclude this target from :all expansion
     ],
     deps = [
-        "@com_github_nvidia_cutlass//:cutlass",
+        "@cutlass",
     ],
 )
 
diff --git a/libspu/dialect/pphlo/transforms/sort_lowering.cc b/libspu/dialect/pphlo/transforms/sort_lowering.cc
index 3640e8ad5..6660a8b40 100644
--- a/libspu/dialect/pphlo/transforms/sort_lowering.cc
+++ b/libspu/dialect/pphlo/transforms/sort_lowering.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -69,6 +70,99 @@ struct SortConversion : public OpRewritePattern<SortOp> {
         return success();
       }
     }
+
+    // Pattern for jax.lax.sort lowering: a less/greater comparator per key,
+    // chained with equal/and/or. Bodies that cannot be indexed safely fail.
+    if (comp.hasOneBlock()) {
+      auto &first_inst = comp.front().front();
+      bool match_less = matchPattern(&first_inst, m_Op<pphlo::LessOp>());
+      bool match_greater = matchPattern(&first_inst, m_Op<pphlo::GreaterOp>());
+      if (match_less || match_greater) {
+        // GreaterOp selects descending order, LessOp ascending order.
+        auto direction = SortDirectionAttr::get(
+            op->getContext(),
+            match_greater ? SortDirection::DES : SortDirection::ASC);
+
+        size_t key_nums = 0;
+        const auto comp_name = first_inst.getName().getStringRef();
+        // Save the result of each instruction for the and/or checks below.
+        std::vector<mlir::Value> results;
+        for (auto &instr : comp.front().without_terminator()) {
+          // Each op is read below as a binary op with one result; anything
+          // else cannot belong to this pattern.
+          if (instr.getNumOperands() < 2 || instr.getNumResults() == 0) {
+            return failure();
+          }
+          if (matchPattern(&instr, m_Op(comp_name))) {
+            key_nums++;
+          }
+          results.push_back(instr.getResult(0));
+        }
+
+        // Indices into results of the operands expected by the next and/or.
+        // With one key, 2 * key_nums - 3 wraps around; the bounds check in
+        // the and/or branch rejects it before it is used as an index.
+        size_t lhs_idx = 2 * key_nums - 3;
+        size_t rhs_idx = 2 * key_nums - 2;
+        for (auto [i, instr] :
+             llvm::enumerate(comp.front().without_terminator())) {
+          if (i <= 2 * key_nums - 2) {
+            auto lhs_arg =
+                mlir::dyn_cast<mlir::BlockArgument>(instr.getOperand(0));
+            auto rhs_arg =
+                mlir::dyn_cast<mlir::BlockArgument>(instr.getOperand(1));
+
+            if (lhs_arg == nullptr || rhs_arg == nullptr) {
+              return failure();
+            }
+
+            // Distinct names: the outer lhs_idx/rhs_idx must stay visible.
+            auto lhs_arg_no = lhs_arg.getArgNumber();
+            auto rhs_arg_no = rhs_arg.getArgNumber();
+
+            // less/greater at an even position must compare args (i, i + 1)
+            if ((i & 1) == 0 && matchPattern(&instr, m_Op(comp_name))) {
+              if (lhs_arg_no != i || rhs_arg_no != (i + 1)) {
+                return failure();
+              }
+            }
+            // equal at an odd position must compare args (i - 1, i)
+            if ((i & 1) == 1 && matchPattern(&instr, m_Op<pphlo::EqualOp>())) {
+              if (lhs_arg_no != (i - 1) || rhs_arg_no != i) {
+                return failure();
+              }
+            }
+          } else {
+            // Surplus trailing ops would push the indices outside results
+            // (lhs_idx wraps below zero), so give up on the pattern.
+            if (lhs_idx >= results.size() || rhs_idx >= results.size()) {
+              return failure();
+            }
+            // check the operands of and/or, in either order
+            auto lhs = instr.getOperand(0);
+            auto rhs = instr.getOperand(1);
+            bool pass = (lhs == results[lhs_idx] && rhs == results[rhs_idx]) ||
+                        (lhs == results[rhs_idx] && rhs == results[lhs_idx]);
+
+            // and is expected at odd positions, or at even positions
+            const bool is_and_or =
+                (i & 1) == 1 ? matchPattern(&instr, m_Op<pphlo::AndOp>())
+                             : matchPattern(&instr, m_Op<pphlo::OrOp>());
+            if (is_and_or && !pass) {
+              return failure();
+            }
+            lhs_idx--;
+            rhs_idx++;
+          }
+        }
+
+        rewriter.replaceOpWithNewOp<pphlo::SimpleSortOp>(
+            op, op.getResultTypes(), op.getOperands(), op.getDimensionAttr(),
+            rewriter.getI64IntegerAttr(key_nums), direction);
+        return success();
+      }
+    }
+
     return failure();
   }
 };
diff --git a/libspu/kernel/BUILD.bazel b/libspu/kernel/BUILD.bazel
index d588fa53a..f48355aee 100644
--- a/libspu/kernel/BUILD.bazel
+++ b/libspu/kernel/BUILD.bazel
@@ -27,6 +27,6 @@ spu_cc_library(
         "//libspu/kernel/hal:prot_wrapper",  # BAD
         "//libspu/kernel/hal:public_helper",  # BAD
         "//libspu/mpc:factory",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
 )
diff --git a/libspu/kernel/hal/BUILD.bazel b/libspu/kernel/hal/BUILD.bazel
index b5aeef415..258bfe80c 100644
--- a/libspu/kernel/hal/BUILD.bazel
+++ b/libspu/kernel/hal/BUILD.bazel
@@ -305,3 +305,12 @@ spu_cc_test(
         "//libspu/kernel:test_util",
     ],
 )
+
+spu_cc_library(
+    name = "soprf",
+    srcs = ["soprf.cc"],
+    hdrs = ["soprf.h"],
+    deps = [
+        ":ring",
+    ],
+)
diff --git a/libspu/kernel/hal/permute.cc b/libspu/kernel/hal/permute.cc
index b7c34ff20..d40115766 100644
--- a/libspu/kernel/hal/permute.cc
+++ b/libspu/kernel/hal/permute.cc
@@ -44,6 +44,129 @@ inline bool _has_same_owner(const Value &x, const Value &y) {
   return _get_owner(x) == _get_owner(y);
 }
 
+hal::CompFn _get_cmp_func(SPUContext *ctx, int64_t num_keys,
+                          SortDirection direction, bool append_rand = false) {
+  hal::CompFn comp_fn = [ctx, num_keys, direction, append_rand](
+                            absl::Span<const spu::Value> values) -> spu::Value {
+    auto scalar_cmp = [direction](spu::SPUContext *ctx, const spu::Value &lhs,
+                                  const spu::Value &rhs) {
+      if (direction == SortDirection::Ascending) {
+        return hal::less(ctx, lhs, rhs);
+      }
+      return hal::greater(ctx, lhs, rhs);
+    };
+
+    spu::Value k1 = hal::constant(ctx, true, DT_I1, values[0].shape());
+    spu::Value pre_equal = k1;
+    spu::Value result = scalar_cmp(ctx, values[0], values[1]);
+    // the idea here is that if the two values of every previous key are equal,
+    // then we compare the two values of the current key, and iteratively
+    // update the result which indicates whether to swap values
+    int64_t idx;
+    for (idx = 2; idx < num_keys * 2; idx += 2) {
+      pre_equal = hal::bitwise_and(
+          ctx, pre_equal, hal::equal(ctx, values[idx - 2], values[idx - 1]));
+      auto current = scalar_cmp(ctx, values[idx], values[idx + 1]);
+      current = hal::bitwise_and(ctx, pre_equal, current);
+      result = hal::bitwise_or(ctx, result, current);
+    }
+
+    // append rand value to avoid the same key "pitfall" in partition-based
+    // algorithms (e.g. quick-sort, quick-select).
+    if (append_rand) {
+      // must use secret bits here, otherwise some information will leak
+      auto rand_bits = hal::random(ctx, VIS_SECRET, DT_I1, values[0].shape());
+
+      // equal has better performance for aby3
+      // cmp+andbb has better performance for semi2k now
+      pre_equal = hal::bitwise_and(
+          ctx, pre_equal, hal::equal(ctx, values[idx - 2], values[idx - 1]));
+      auto current = hal::bitwise_and(ctx, pre_equal, rand_bits);
+      result = hal::bitwise_or(ctx, result, current);
+    }
+
+    return result;
+  };
+
+  return comp_fn;
+}
+
+bool _check_method_require(SPUContext *ctx, RuntimeConfig::SortMethod method) {
+  // Reports whether ctx has every kernel the given sort method is checked
+  // against; each case returns directly, unknown methods throw.
+  switch (method) {
+    case RuntimeConfig::SORT_RADIX:
+      return ctx->hasKernel("rand_perm_m") && ctx->hasKernel("perm_am") &&
+             ctx->hasKernel("perm_ap") && ctx->hasKernel("inv_perm_am") &&
+             ctx->hasKernel("inv_perm_ap");
+
+    case RuntimeConfig::SORT_QUICK:
+      // quick sort only requires a small subset of the shuffle kernels, but
+      // needs the rand_b kernel to avoid calling a2b.
+      return ctx->hasKernel("rand_perm_m") && ctx->hasKernel("perm_am") &&
+             ctx->hasKernel("rand_b");
+
+    case RuntimeConfig::SORT_NETWORK:
+      // sort network is a general method which can be used for all MPC
+      // protocols.
+      return true;
+
+    default:
+      // any other enumerator (e.g. SORT_DEFAULT) is a caller error
+      SPU_THROW("Should not reach here");
+  }
+}
+
+RuntimeConfig::SortMethod select_sort_method(
+    SPUContext *ctx, RuntimeConfig::SortMethod preferred_method) {
+  // Resolves the sort method to run: the preferred one when its kernel
+  // requirements are met, otherwise the sorting network.
+  //
+  // SORT_DEFAULT is rejected here, so it must be resolved beforehand.
+  SPU_ENFORCE(preferred_method != RuntimeConfig::SORT_DEFAULT);
+
+  // classify the request once so that each outcome below is a plain guard
+  const bool wants_network = preferred_method == RuntimeConfig::SORT_NETWORK;
+  const bool wants_optional =
+      preferred_method == RuntimeConfig::SORT_RADIX ||
+      preferred_method == RuntimeConfig::SORT_QUICK;
+
+  // anything that is not one of the three known methods is a caller error
+  if (!wants_network && !wants_optional) {
+    SPU_THROW("should not reach here");
+  }
+
+  if (internal::_check_method_require(ctx, preferred_method)) {
+    return preferred_method;
+  }
+
+  // the check for the sorting network is always true now, so failing it is
+  // treated as an internal error rather than a reason to fall back
+  if (wants_network) {
+    SPU_THROW("should not reach here");
+  }
+
+  // if the preferred method is not supported, fall back to sorting network now.
+  const RuntimeConfig::SortMethod fallback_method = RuntimeConfig::SORT_NETWORK;
+
+  return fallback_method;
+}
+
+std::vector<spu::Value> fallback_sort1d(SPUContext *ctx,
+                                        absl::Span<spu::Value const> inputs,
+                                        int64_t num_keys,
+                                        SortDirection direction) {
+  // vis handed to sort1d: public only if every one of the keys is public
+  const bool all_public =
+      std::all_of(inputs.begin(), inputs.begin() + num_keys,
+                  [](const spu::Value &key) { return key.isPublic(); });
+  auto cmp = _get_cmp_func(ctx, num_keys, direction);
+  // currently, general sort1d only supports odd-even sorting network which is
+  // an unstable sort method.
+  return sort1d(ctx, inputs, cmp, all_public ? VIS_PUBLIC : VIS_SECRET,
+                false);
+}
+
 void _hint_nbits(const Value &a, size_t nbits) {
   if (a.storage_type().isa<BShare>()) {
     const_cast<Type &>(a.storage_type()).as<BShare>()->setNbits(nbits);
@@ -218,6 +341,290 @@ void HandleSmallArray(SPUContext *ctx, const CompFn &comparator_body,
   }
 }
 
+std::vector<Value> _construct_cmp_values(
+    SPUContext *ctx, const std::vector<std::pair<int64_t, int64_t>> &intervals,
+    absl::Span<spu::Value const> arr, const int64_t quick_sort_thres,
+    const int64_t num_keys) {
+  int64_t lo;
+  int64_t hi;
+  int64_t left;
+  int64_t right;
+
+  std::vector<std::vector<Value>> cmp_values(2 * num_keys);
+  for (auto &values : cmp_values) {
+    values.reserve(intervals.size());
+  }
+
+  for (const auto &interval : intervals) {
+    std::tie(lo, hi) = interval;
+
+    if (hi - lo <= quick_sort_thres) {
+      continue;
+    }
+
+    left = lo + 1;
+    right = hi;
+
+    for (int64_t i = 0; i < num_keys; i++) {
+      // pivot
+      cmp_values[2 * i].push_back(broadcast_to(
+          ctx, slice_scalar_at(ctx, arr[i], {lo}), {right - left + 1}));
+      // others
+      cmp_values[2 * i + 1].push_back(slice(ctx, arr[i], {left}, {right + 1}));
+    }
+  }
+
+  // no need to quick sort
+  if (cmp_values[0].empty()) {
+    return {};
+  }
+
+  std::vector<Value> ret;
+  ret.reserve(2 * num_keys);
+
+  for (int64_t i = 0; i < 2 * num_keys; i++) {
+    ret.push_back(concatenate(ctx, cmp_values[i], 0));
+  }
+
+  return ret;
+}
+
+bool Partition(SPUContext *ctx, const int64_t num_keys,
+               const CompFn &comparator_body, absl::Span<spu::Value> arr,
+               std::vector<std::pair<int64_t, int64_t>> &intervals) {
+  if (intervals.empty()) {
+    return false;
+  }
+
+  int64_t quick_sort_thres = ctx->config().quick_sort_threshold();
+
+  int64_t lo;  // left end of current interval
+  int64_t hi;  // right end of current interval
+
+  int64_t left;   // location of left pointer
+  int64_t right;  // location of right pointer
+  int64_t mid;    // location of pivot element after partition
+
+  auto values =
+      _construct_cmp_values(ctx, intervals, arr, quick_sort_thres, num_keys);
+
+  if (values.empty()) {
+    return false;
+  }
+
+  auto predicate = comparator_body(values);
+  auto _predicate = dump_public_as<bool>(ctx, hal::reveal(ctx, predicate));
+
+  Index lhs_indices;
+  Index rhs_indices;
+  Index pivot_indices;
+  Index mid_indices;
+  // save partition output, i.e. (lo, mid, hi), where mid is the location of
+  // pivot after partition.
+  std::vector<std::tuple<int64_t, int64_t, int64_t>> pos;
+  // save the intervals that do not need quick sort anymore.
+  std::vector<std::pair<int64_t, int64_t>> pass_vec;
+
+  int64_t length = 0;
+  for (auto item : intervals) {
+    std::tie(lo, hi) = item;
+
+    if (hi - lo <= quick_sort_thres) {
+      pass_vec.emplace_back(lo, hi);
+      continue;
+    }
+
+    left = lo + 1;
+    right = hi;
+
+    auto offset = left;
+    // use two pointers for partition
+    for (;;) {
+      while (right >= left && !_predicate[left - offset + length]) {
+        left++;
+      }
+      while (right >= left && _predicate[right - offset + length]) {
+        right--;
+      }
+      if (right < left) {
+        break;
+      }
+
+      lhs_indices.emplace_back(left);
+      rhs_indices.emplace_back(right);
+
+      left++;
+      right--;
+    }
+    length += (hi - lo);
+
+    pivot_indices.emplace_back(lo);
+    mid_indices.emplace_back(right);
+    pos.emplace_back(lo, right, hi);
+  }
+  Swap(arr, lhs_indices, rhs_indices);
+  // swap the pivot
+  Swap(arr, pivot_indices, mid_indices);
+
+  intervals.swap(pass_vec);
+  intervals.reserve(2 * intervals.size());
+
+  while (!pos.empty()) {
+    std::tie(lo, mid, hi) = pos.back();
+    pos.pop_back();
+    if (lo < mid) {
+      intervals.emplace_back(lo, mid - 1);
+    }
+    if (mid < hi) {
+      intervals.emplace_back(mid + 1, hi);
+    }
+  }
+
+  return true;
+}
+
+// this algorithm is mainly adapted from odd-even mergesort, but we can reveal
+// the comparison results because of shuffling
+void mergesort(SPUContext *ctx, const CompFn &comparator_body,
+               absl::Span<spu::Value> arr,
+               std::vector<std::pair<int64_t, int64_t>> &intervals) {
+  const auto N = arr.front().numel();
+  int64_t logn = Log2Ceil(N);
+  // max depth for odd-even merge network
+  int64_t depth = ((logn + 1) * logn) / 2;
+
+  std::vector<Index> lhs_indices(depth);
+  std::vector<Index> rhs_indices(depth);
+
+  int64_t lo;
+  int64_t hi;
+  for (auto item : intervals) {
+    std::tie(lo, hi) = item;
+    if (hi - lo <= 0) {
+      continue;
+    }
+
+    int64_t n = hi - lo + 1;
+    int64_t cnt = 0;
+    for (int64_t max_gap_in_stage = 1; max_gap_in_stage < n;
+         max_gap_in_stage += max_gap_in_stage) {
+      for (int64_t step = max_gap_in_stage; step > 0; step /= 2) {
+        for (int64_t j = step % max_gap_in_stage; j + step < n;
+             j += step + step) {
+          auto range = max_gap_in_stage + max_gap_in_stage;
+
+          for (int64_t i = 0; i < step; i++) {
+            auto lhs_idx = i + j;
+            auto rhs_idx = i + j + step;
+
+            if (rhs_idx >= n) {
+              break;
+            }
+
+            if (lhs_idx / range == rhs_idx / range) {
+              lhs_indices[cnt].emplace_back(lhs_idx + lo);
+              rhs_indices[cnt].emplace_back(rhs_idx + lo);
+            }
+          }
+        }
+        cnt += 1;
+      }
+    }
+  }
+
+  size_t num_operands = arr.size();
+  for (size_t i = 0; i < lhs_indices.size(); i++) {
+    if (lhs_indices[i].empty()) {
+      continue;
+    }
+
+    Index lhs_indice;
+    Index rhs_indice;
+
+    std::vector<spu::Value> values;
+    values.reserve(2 * num_operands);
+
+    for (size_t j = 0; j < num_operands; ++j) {
+      values.emplace_back(arr[j].data().linear_gather(lhs_indices[i]),
+                          arr[j].dtype());
+      values.emplace_back(arr[j].data().linear_gather(rhs_indices[i]),
+                          arr[j].dtype());
+    }
+    auto predicate = comparator_body(values);
+    auto _predicate = dump_public_as<bool>(ctx, hal::reveal(ctx, predicate));
+    for (size_t k = 0; k < _predicate.size(); k++) {
+      if (!_predicate[k]) {
+        lhs_indice.emplace_back(lhs_indices[i][k]);
+        rhs_indice.emplace_back(rhs_indices[i][k]);
+      }
+    }
+    Swap(arr, lhs_indice, rhs_indice);
+  }
+}
+
+std::vector<spu::Value> QuickMergesort(SPUContext *ctx, const int64_t num_keys,
+                                       const CompFn &quick_comp,
+                                       const CompFn &merge_comp,
+                                       absl::Span<spu::Value const> inputs) {
+  // we do not need to copy or _2s here because of the secret shuffling.
+  std::vector<spu::Value> ret(inputs.begin(), inputs.end());
+
+  const auto n = inputs.front().numel();
+  std::vector<std::pair<int64_t, int64_t>> intervals;
+  intervals.emplace_back(0, n - 1);
+  int64_t quicksort_num = 0;
+  // set max depth to avoid infinite loop
+  int64_t depth = 1000;
+  bool need_quick_sort = true;
+
+  while (!intervals.empty()) {
+    need_quick_sort =
+        Partition(ctx, num_keys, quick_comp, absl::MakeSpan(ret), intervals);
+    quicksort_num += 1;
+
+    if (!need_quick_sort || (quicksort_num == depth)) {
+      break;
+    }
+  }
+
+  if (intervals.empty()) {
+    return ret;
+  }
+
+  mergesort(ctx, merge_comp, absl::MakeSpan(ret), intervals);
+
+  return ret;
+}
+
+std::vector<spu::Value> PrepareSort(SPUContext *ctx,
+                                    absl::Span<spu::Value const> inputs) {
+  // use a random permutation to break the link of values, such that the
+  // following comparisons can be revealed without leaking information.
+  // The same permutation is applied to every input.
+  auto shuffle = _rand_perm_s(ctx, inputs.front().shape());
+  std::vector<spu::Value> shuffled;
+  shuffled.reserve(inputs.size());
+  for (const auto &value : inputs) {
+    // the permuted value keeps the dtype of the original input
+    shuffled.emplace_back(
+        std::move(_perm_ss(ctx, value, shuffle).setDtype(value.dtype())));
+  }
+  return shuffled;
+}
+
+std::vector<spu::Value> quick_sort(SPUContext *ctx,
+                                   absl::Span<spu::Value const> inputs,
+                                   int64_t num_keys, SortDirection direction) {
+  // comparator for the quick sort stage: an extra random key is appended
+  const auto partition_cmp = _get_cmp_func(ctx, num_keys, direction, true);
+  // comparator for the merge sort stage: only the normal keys are compared
+  const auto merge_cmp = _get_cmp_func(ctx, num_keys, direction);
+  // shuffle the inputs first, then sort the shuffled copies
+  auto shuffled = PrepareSort(ctx, inputs);
+  return QuickMergesort(ctx, num_keys, partition_cmp, merge_cmp,
+                        absl::MakeSpan(shuffled));
+}
+
 void TwoWayPartition(SPUContext *ctx, const CompFn &comparator_body,
                      absl::Span<spu::Value> arr, int64_t lo, int64_t hi,
                      const TopKConfig &config,
@@ -684,9 +1091,11 @@ spu::Value _apply_inv_perm_ss(SPUContext *ctx, const spu::Value &x,
 // Compose is actually a special case of apply_perm where both inputs are
 // permutations. So to be more general, we use the name _apply_perm_ss
 // rather than _compose_ss here
-spu::Value _apply_perm_ss(SPUContext *ctx, const Value &x, const Value &perm) {
+std::vector<spu::Value> _apply_perm_ss(SPUContext *ctx,
+                                       absl::Span<spu::Value const> x,
+                                       const Value &perm) {
   // 1. <SP> = secure shuffle <perm>
-  auto shuffle_perm = hal::_rand_perm_s(ctx, x.shape());
+  auto shuffle_perm = hal::_rand_perm_s(ctx, x[0].shape());
   auto sp = hal::_perm_ss(ctx, perm, shuffle_perm);
 
   // 2. M = reveal(<SP>)
@@ -694,14 +1103,28 @@ spu::Value _apply_perm_ss(SPUContext *ctx, const Value &x, const Value &perm) {
   SPU_ENFORCE_EQ(m.shape().ndim(), 1U, "perm should be 1-d tensor");
 
   // 3. sx = apply_perm(x,m)
-  auto sx = hal::_perm_sp(ctx, x, m);
+  std::vector<spu::Value> sx;
+  sx.reserve(x.size());
+  for (const auto &item : x) {
+    sx.emplace_back(hal::_perm_sp(ctx, item, m));
+  }
 
   // 4. ret = unshuffle(<sx>)
-  auto ret = hal::_inv_perm_ss(ctx, sx, shuffle_perm);
+  std::vector<spu::Value> ret;
+  ret.reserve(x.size());
+  for (const auto &item : sx) {
+    ret.emplace_back(hal::_inv_perm_ss(ctx, item, shuffle_perm));
+  }
 
   return ret;
 }
 
+spu::Value _apply_perm_ss(SPUContext *ctx, const Value &x, const Value &perm) {
+  std::vector<spu::Value> inputs{x};
+  auto ret = _apply_perm_ss(ctx, inputs, perm);
+  return std::move(ret[0]);
+}
+
 // Find mergeable keys from keys. Consecutive public/private(belong to one
 // owner) keys can be merged. Assume there are six keys, i.e., public_key0,
 // bob_key0, bob_key1, alice_key0, alice_key1, secret_key0. We can merge the
@@ -768,10 +1191,37 @@ spu::Value _apply_inv_perm_sv(SPUContext *ctx, const Value &in,
   }
 }
 
-#define MAP_APPLY_PERM_OP(NAME)                             \
-  spu::Value _apply##NAME(SPUContext *ctx, const Value &in, \
-                          const Value &perm) {              \
-    return hal::NAME(ctx, in, perm);                        \
+std::vector<Value> _apply_inv_perm_sv(SPUContext *ctx,
+                                      absl::Span<Value const> inputs,
+                                      const Value &perm) {
+  if (ctx->hasKernel("inv_perm_av")) {
+    std::vector<spu::Value> ret;
+    ret.reserve(inputs.size());
+    for (const auto &input : inputs) {
+      ret.emplace_back(
+          _apply_inv_perm_sv(ctx, input, perm).setDtype(input.dtype()));
+    }
+    return ret;
+  } else {
+    return _apply_inv_perm_ss(ctx, inputs, _2s(ctx, perm));
+  }
+}
+
+#define MAP_APPLY_PERM_OP(NAME)                                             \
+  spu::Value _apply##NAME(SPUContext *ctx, const Value &in,                 \
+                          const Value &perm) {                              \
+    return hal::NAME(ctx, in, perm);                                        \
+  }                                                                         \
+                                                                            \
+  std::vector<Value> _apply##NAME(                                          \
+      SPUContext *ctx, absl::Span<Value const> inputs, const Value &perm) { \
+    std::vector<Value> ret;                                                 \
+    ret.reserve(inputs.size());                                             \
+    for (const auto &input : inputs) {                                      \
+      ret.emplace_back(                                                     \
+          _apply##NAME(ctx, input, perm).setDtype(input.dtype()));          \
+    }                                                                       \
+    return ret;                                                             \
   }
 
 MAP_APPLY_PERM_OP(_perm_pp);
@@ -781,41 +1231,87 @@ MAP_APPLY_PERM_OP(_inv_perm_pp);
 MAP_APPLY_PERM_OP(_inv_perm_vv);
 MAP_APPLY_PERM_OP(_inv_perm_sp);
 
+#define MAP_VEC_CONVERT_OP(NAME)                                             \
+  std::vector<Value> NAME(SPUContext *ctx, absl::Span<Value const> inputs) { \
+    std::vector<Value> ret;                                                  \
+    ret.reserve(inputs.size());                                              \
+    for (const auto &input : inputs) {                                       \
+      ret.emplace_back(hal::NAME(ctx, input).setDtype(input.dtype()));       \
+    }                                                                        \
+    return ret;                                                              \
+  }
+
+MAP_VEC_CONVERT_OP(_p2s);
+MAP_VEC_CONVERT_OP(_v2s);
+
+#undef MAP_VEC_CONVERT_OP
+
+std::vector<Value> _p2v(SPUContext *ctx, absl::Span<Value const> inputs,
+                        int owner) {
+  std::vector<Value> ret;
+  ret.reserve(inputs.size());
+  for (const auto &input : inputs) {
+    ret.emplace_back(hal::_p2v(ctx, input, owner).setDtype(input.dtype()));
+  }
+  return ret;
+}
+
 // Given a permutation, apply (inverse) permutation on a 1-d array input
-#define MAP_PERM_OP(NAME)                                                \
-  spu::Value NAME(SPUContext *ctx, const Value &in, const Value &perm) { \
-    SPU_TRACE_HAL_DISP(ctx, in, perm);                                   \
-    if (in.isPublic() && perm.isPublic()) { /*PP*/                       \
-      return NAME##_pp(ctx, in, perm);                                   \
-    } else if (in.isPublic() && perm.isSecret()) { /*PS*/                \
-      return NAME##_ss(ctx, _p2s(ctx, in), perm);                        \
-    } else if (in.isPublic() && perm.isPrivate()) { /*PV*/               \
-      return NAME##_vv(ctx, _p2v(ctx, in, _get_owner(perm)), perm);      \
-    } else if (in.isPrivate() && perm.isPrivate()) { /*VV*/              \
-      if (_has_same_owner(in, perm)) {                                   \
-        return NAME##_vv(ctx, in, perm);                                 \
-      } else {                                                           \
-        return NAME##_sv(ctx, _v2s(ctx, in), perm);                      \
-      }                                                                  \
-    } else if (in.isPrivate() && perm.isPublic()) { /*VP*/               \
-      return NAME##_vv(ctx, in, _p2v(ctx, perm, _get_owner(in)));        \
-    } else if (in.isPrivate() && perm.isSecret()) { /*VS*/               \
-      return NAME##_ss(ctx, _v2s(ctx, in), perm);                        \
-    } else if (in.isSecret() && perm.isSecret()) { /*SS*/                \
-      return NAME##_ss(ctx, in, perm);                                   \
-    } else if (in.isSecret() && perm.isPublic()) { /*SP*/                \
-      return NAME##_sp(ctx, in, perm);                                   \
-    } else if (in.isSecret() && perm.isPrivate()) { /*SV*/               \
-      return NAME##_sv(ctx, in, perm);                                   \
-    } else {                                                             \
-      SPU_THROW("should not be here");                                   \
-    }                                                                    \
+#define MAP_PERM_OP(NAME)                                                   \
+  std::vector<Value> NAME(SPUContext *ctx, absl::Span<Value const> in,      \
+                          const Value &perm) {                              \
+    SPU_ENFORCE(!in.empty(), "Inputs should not be empty");                 \
+    SPU_ENFORCE(std::all_of(in.begin(), in.end(),                           \
+                            [&in](const spu::Value &v) {                    \
+                              return v.vtype() == in[0].vtype();            \
+                            }),                                             \
+                "Inputs visibility mismatched");                            \
+    if (in[0].isPrivate()) {                                                \
+      SPU_ENFORCE(std::all_of(in.begin(), in.end(),                         \
+                              [&in](const spu::Value &v) {                  \
+                                return internal::_has_same_owner(v, in[0]); \
+                              }),                                           \
+                  "Inputs owner mismatched");                               \
+    }                                                                       \
+    SPU_TRACE_HAL_DISP(ctx, in[0], perm);                                   \
+    if (in[0].isPublic() && perm.isPublic()) { /*PP*/                       \
+      return NAME##_pp(ctx, in, perm);                                      \
+    } else if (in[0].isPublic() && perm.isSecret()) { /*PS*/                \
+      return NAME##_ss(ctx, _p2s(ctx, in), perm);                           \
+    } else if (in[0].isPublic() && perm.isPrivate()) { /*PV*/               \
+      return NAME##_vv(ctx, _p2v(ctx, in, _get_owner(perm)), perm);         \
+    } else if (in[0].isPrivate() && perm.isPrivate()) { /*VV*/              \
+      if (_has_same_owner(in[0], perm)) {                                   \
+        return NAME##_vv(ctx, in, perm);                                    \
+      } else {                                                              \
+        return NAME##_sv(ctx, _v2s(ctx, in), perm);                         \
+      }                                                                     \
+    } else if (in[0].isPrivate() && perm.isPublic()) { /*VP*/               \
+      return NAME##_vv(ctx, in, hal::_p2v(ctx, perm, _get_owner(in[0])));   \
+    } else if (in[0].isPrivate() && perm.isSecret()) { /*VS*/               \
+      return NAME##_ss(ctx, _v2s(ctx, in), perm);                           \
+    } else if (in[0].isSecret() && perm.isSecret()) { /*SS*/                \
+      return NAME##_ss(ctx, in, perm);                                      \
+    } else if (in[0].isSecret() && perm.isPublic()) { /*SP*/                \
+      return NAME##_sp(ctx, in, perm);                                      \
+    } else if (in[0].isSecret() && perm.isPrivate()) { /*SV*/               \
+      return NAME##_sv(ctx, in, perm);                                      \
+    } else {                                                                \
+      SPU_THROW("should not be here");                                      \
+    }                                                                       \
   }
 
 // Inverse permute 1-D array x with a permutation perm
 // ret[perm[i]] = x[i]
 MAP_PERM_OP(_apply_inv_perm)
 
+spu::Value _apply_inv_perm(SPUContext *ctx, const spu::Value &x,
+                           const spu::Value &perm) {
+  std::vector<spu::Value> inputs{x};
+  auto ret = _apply_inv_perm(ctx, inputs, perm);
+  return std::move(ret[0]);
+}
+
 // Given a permutation, generate its inverse permutation
 // ret[perm[i]] = i
 spu::Value _inverse(SPUContext *ctx, const Value &perm) {
@@ -829,14 +1325,27 @@ spu::Value _apply_perm_sv(SPUContext *ctx, const Value &in, const Value &perm) {
   if (ctx->hasKernel("inv_perm_av")) {
     return hal::_inv_perm_sv(ctx, in, _inverse(ctx, perm));
   } else {
-    return _apply_inv_perm_ss(ctx, in, _v2s(ctx, _inverse(ctx, perm)));
+    return _apply_inv_perm_ss(ctx, in, hal::_v2s(ctx, _inverse(ctx, perm)));
   }
 }
 
+std::vector<Value> _apply_perm_sv(SPUContext *ctx,
+                                  absl::Span<Value const> inputs,
+                                  const Value &perm) {
+  return _apply_inv_perm_sv(ctx, inputs, _inverse(ctx, perm));
+}
+
 // Permute 1-D array x with a permutation perm
 // ret[i] = x[perm[i]]
 MAP_PERM_OP(_apply_perm)
 
+spu::Value _apply_perm(SPUContext *ctx, const spu::Value &x,
+                       const spu::Value &perm) {
+  std::vector<spu::Value> inputs{x};
+  auto ret = _apply_perm(ctx, inputs, perm);
+  return std::move(ret[0]);
+}
+
 // Compose two permutations into one permutation
 // If we have two permutations x and y, we want to get a permutation z from x
 // and y that apply_inv_perm(in, z) = apply_inv_perm(apply_inv_perm(in, x), y)
@@ -844,6 +1353,8 @@ spu::Value _compose_perm(SPUContext *ctx, const Value &x, const Value &y) {
   return _apply_perm(ctx, y, x);
 }
 
+#undef MAP_PERM_OP
+
 spu::Value _merge_keys(SPUContext *ctx, absl::Span<Value const> inputs,
                        bool is_ascending) {
   if (inputs[0].isPublic()) {
@@ -1065,58 +1576,84 @@ std::vector<spu::Value> simple_sort1d(SPUContext *ctx,
   SPU_ENFORCE(num_keys > 0 && num_keys <= static_cast<int64_t>(inputs.size()),
               "num_keys {} is not valid", num_keys);
 
-  bool fallback = false;
-  // if all keys are public, fallback to public sort
+  std::vector<spu::Value> ret;
+  const auto sort_method = ctx->config().sort_method();
+
+  // There are multiple sort methods supported by SPU, we will try to seek the
+  // best method in the following order if the user does not specify the method
+  // manually.
+  //   1. If all keys are Public, then fallback to the plaintext sort.
+  //   2. Else, sequentially check if it supports radix sort or quick sort. If a
+  //   match is found, execute the corresponding algorithm; otherwise, the
+  //   default sorting network algorithm will be executed.
+  //
+  // Some takeaways about the above algorithm:
+  //   1. Radix sort is currently the only STABLE sorting algorithm, so we
+  //   choose it as the highest priority algorithm (as long as it is supported
+  //   by the underlying MPC protocol).
+  //   2. It's worth to know that quick sort is indeed faster than radix
+  //   sort when the field is FM64 or FM128 (When in FM32, radix sort is always
+  //   faster).
+  //   3. However, radix sort can be significantly accelerated if you set
+  //   the valid_bits when you know exactly the ranges of the keys.
+  //   4. Radix sort and quick sort are more friendly to multiple payloads but
+  //   not to multiple keys. Increasing one payload only adds one secret
+  //   shuffle; however, for n additional keys, the communication/time can be
+  //   roughly considered to multiply by n.
+  //   5. Quick sort is more adaptable to the expansion of the ring. When the
+  //   ring size doubles, the communication volume of quick sort nearly doubles,
+  //   and the number of rounds increases (poly) logarithmically. In contrast,
+  //   when the ring size doubles in radix sort, the communication (roughly)
+  //   quadruples and the number of rounds doubles.
+  //
+
+  // if all keys are public, fallback to plaintext sort.
   if (std::all_of(inputs.begin(), inputs.begin() + num_keys,
                   [](const spu::Value &v) { return v.isPublic(); })) {
-    fallback = true;
-  }
-  // If the protocol supports secret shuffle and unshuffle, we can use radix
-  // sort for fast 1-D sort. Otherwise, we fallback to generic sort1d
-  if (!fallback &&
-      !(ctx->hasKernel("rand_perm_m") && ctx->hasKernel("perm_am") &&
-        ctx->hasKernel("perm_ap") && ctx->hasKernel("inv_perm_am") &&
-        ctx->hasKernel("inv_perm_ap"))) {
-    fallback = true;
+    return internal::fallback_sort1d(ctx, inputs, num_keys, direction);
   }
-  if (!fallback) {
-    auto ret =
-        internal::radix_sort(ctx, inputs, direction, num_keys, valid_bits);
-    return ret;
+
+  // if the default sort method is used, try to find the best available one;
+  // currently: radix sort -> quick sort -> sorting network
+  if (sort_method == RuntimeConfig::SORT_DEFAULT) {
+    if (internal::_check_method_require(ctx, RuntimeConfig::SORT_RADIX)) {
+      ret = internal::radix_sort(ctx, inputs, direction, num_keys, valid_bits);
+    } else if (internal::_check_method_require(ctx,
+                                               RuntimeConfig::SORT_QUICK)) {
+      ret = internal::quick_sort(ctx, inputs, num_keys, direction);
+    } else if (internal::_check_method_require(
+                   ctx,
+                   RuntimeConfig::SORT_NETWORK)) {  // always true now.
+      ret = internal::fallback_sort1d(ctx, inputs, num_keys, direction);
+    } else {
+      SPU_THROW("should not reach here");
+    }
   } else {
-    auto scalar_cmp = [direction](spu::SPUContext *ctx, const spu::Value &lhs,
-                                  const spu::Value &rhs) {
-      if (direction == SortDirection::Ascending) {
-        return hal::less(ctx, lhs, rhs);
-      }
-      return hal::greater(ctx, lhs, rhs);
-    };
+    auto selected_method = internal::select_sort_method(ctx, sort_method);
+    if (selected_method != sort_method) {
+      SPDLOG_WARN(
+          "Manually set method: {}, which is not supported, falling back to "
+          "{}.",
+          sort_method, selected_method);
+    }
 
-    hal::CompFn comp_fn =
-        [ctx, num_keys,
-         &scalar_cmp](absl::Span<const spu::Value> values) -> spu::Value {
-      spu::Value pre_equal = hal::constant(ctx, true, DT_I1, values[0].shape());
-      spu::Value result = scalar_cmp(ctx, values[0], values[1]);
-      // the idea here is that if the two values of the last key is equal, than
-      // we compare the two values of the current key, and iteratively to update
-      // the result which indicates whether to swap values
-      for (int64_t idx = 2; idx < num_keys * 2; idx += 2) {
-        pre_equal = hal::bitwise_and(
-            ctx, pre_equal, hal::equal(ctx, values[idx - 2], values[idx - 1]));
-        auto current = scalar_cmp(ctx, values[idx], values[idx + 1]);
-        current = hal::bitwise_and(ctx, pre_equal, current);
-        result = hal::bitwise_or(ctx, result, current);
-      }
-      return result;
-    };
-    Visibility vis =
-        std::all_of(inputs.begin(), inputs.begin() + num_keys,
-                    [](const spu::Value &v) { return v.isPublic(); })
-            ? VIS_PUBLIC
-            : VIS_SECRET;
-    auto ret = sort1d(ctx, inputs, comp_fn, vis, false);
-    return ret;
+    switch (selected_method) {
+      case RuntimeConfig::SORT_RADIX:
+        ret =
+            internal::radix_sort(ctx, inputs, direction, num_keys, valid_bits);
+        break;
+      case RuntimeConfig::SORT_QUICK:
+        ret = internal::quick_sort(ctx, inputs, num_keys, direction);
+        break;
+      case RuntimeConfig::SORT_NETWORK:
+        ret = internal::fallback_sort1d(ctx, inputs, num_keys, direction);
+        break;
+      default:
+        SPU_THROW("should not reach here");
+    }
   }
+
+  return ret;
 }
 
 std::vector<spu::Value> permute(SPUContext *ctx,
@@ -1292,4 +1829,22 @@ std::vector<Value> topk_1d(SPUContext *ctx, const spu::Value &input,
   }
 }
 
+std::vector<spu::Value> apply_inv_permute_1d(
+    SPUContext *ctx, absl::Span<const spu::Value> inputs,
+    const spu::Value &perm) {
+  // Note: the kernel `inv_perm_am` in MPC layer is exactly the `unshuffle`
+  // semantics, and we implement `apply_inv_perm_ss` in HAL layer. So we wrap
+  // `apply_inv_perm` here to handle all the inv_perm cases.
+  return internal::_apply_inv_perm(ctx, inputs, perm);
+}
+
+std::vector<spu::Value> apply_permute_1d(SPUContext *ctx,
+                                         absl::Span<const spu::Value> inputs,
+                                         const spu::Value &perm) {
+  // Note: the kernel `perm_am` in MPC layer is exactly the `shuffle`
+  // semantics, and we implement `apply_perm_ss` in HAL layer. So we wrap
+  // `apply_perm` here to handle all the perm cases.
+  return internal::_apply_perm(ctx, inputs, perm);
+}
+
 }  // namespace spu::kernel::hal
diff --git a/libspu/kernel/hal/permute.h b/libspu/kernel/hal/permute.h
index 9025bbec5..be3cd5395 100644
--- a/libspu/kernel/hal/permute.h
+++ b/libspu/kernel/hal/permute.h
@@ -79,4 +79,14 @@ std::vector<Value> topk_1d(SPUContext *ctx, const spu::Value &input,
                            const SimpleCompFn &scalar_cmp,
                            const TopKConfig &config);
 
+// For each input x, we get y = perm^{-1} (x), i.e. y[i] = x[perm^{-1}(i)]
+std::vector<spu::Value> apply_inv_permute_1d(
+    SPUContext *ctx, absl::Span<const spu::Value> inputs,
+    const spu::Value &perm);
+
+// For each input x, we get y = perm(x), i.e. y[i] = x[perm(i)]
+std::vector<spu::Value> apply_permute_1d(SPUContext *ctx,
+                                         absl::Span<const spu::Value> inputs,
+                                         const spu::Value &perm);
+
 }  // namespace spu::kernel::hal
\ No newline at end of file
diff --git a/libspu/kernel/hal/prot_wrapper.cc b/libspu/kernel/hal/prot_wrapper.cc
index 7e03454d1..e1e060e77 100644
--- a/libspu/kernel/hal/prot_wrapper.cc
+++ b/libspu/kernel/hal/prot_wrapper.cc
@@ -88,9 +88,9 @@ Value _rand_p(SPUContext* ctx, const Shape& shape) {
   return rnd;
 }
 
-Value _rand_s(SPUContext* ctx, const Shape& shape) {
+Value _rand_s(SPUContext* ctx, const Shape& shape, DataType dtype) {
   SPU_TRACE_HAL_DISP(ctx, shape);
-  auto rnd = mpc::rand_s(ctx, shape);
+  auto rnd = mpc::rand_s(ctx, shape, dtype);
   return rnd;
 }
 
diff --git a/libspu/kernel/hal/prot_wrapper.h b/libspu/kernel/hal/prot_wrapper.h
index a3c138cac..6c50fadb6 100644
--- a/libspu/kernel/hal/prot_wrapper.h
+++ b/libspu/kernel/hal/prot_wrapper.h
@@ -120,7 +120,7 @@ Value _bitrev_v(SPUContext* ctx, const Value& in, size_t start, size_t end);
 Value _make_p(SPUContext* ctx, uint128_t init, const Shape& shape);
 
 Value _rand_p(SPUContext* ctx, const Shape& shape);
-Value _rand_s(SPUContext* ctx, const Shape& shape);
+Value _rand_s(SPUContext* ctx, const Shape& shape, DataType dtype);
 
 // FIXME: temporary API, formalize later
 Value _rand_perm_s(SPUContext* ctx, const Shape& shape);
diff --git a/libspu/kernel/hal/random.cc b/libspu/kernel/hal/random.cc
index c614d191b..ddf0b5924 100644
--- a/libspu/kernel/hal/random.cc
+++ b/libspu/kernel/hal/random.cc
@@ -64,7 +64,7 @@ Value random(SPUContext* ctx, Visibility vis, DataType dtype,
   if (vis == VIS_PUBLIC) {
     ret = _rand_p(ctx, shape).setDtype(dtype);
   } else if (vis == VIS_SECRET) {
-    ret = _rand_s(ctx, shape).setDtype(dtype);
+    ret = _rand_s(ctx, shape, dtype).setDtype(dtype);
   } else {
     SPU_THROW("Invalid visibility={}", vis);
   }
diff --git a/libspu/kernel/hal/soprf.cc b/libspu/kernel/hal/soprf.cc
new file mode 100644
index 000000000..4e0356b9d
--- /dev/null
+++ b/libspu/kernel/hal/soprf.cc
@@ -0,0 +1,78 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/kernel/hal/soprf.h"
+
+#include "libspu/core/trace.h"
+#include "libspu/kernel/hal/prot_wrapper.h"
+#include "libspu/kernel/hal/ring.h"
+
+namespace spu::kernel::hal {
+
+Value soprf(SPUContext* ctx, const Value& x) {
+  SPU_TRACE_HAL_LEAF(ctx, x);
+
+  // currently, we only support LowMC block cipher
+  SPU_ENFORCE(ctx->hasKernel("lowmc_b"));
+  auto inp = x;
+
+  if (x.isPublic()) {
+    inp = _p2s(ctx, x);
+  } else if (x.isPrivate()) {
+    inp = _v2s(ctx, x);
+  }
+
+  auto ret = dynDispatch<spu::Value>(ctx, "lowmc_b", _prefer_b(ctx, inp));
+
+  return ret.setDtype(x.dtype());
+}
+
+namespace {
+spu::Value _2s(SPUContext* ctx, const Value& x) {
+  if (x.isPublic()) {
+    return _p2s(ctx, x);
+  } else if (x.isPrivate()) {
+    return _v2s(ctx, x);
+  }
+  return x;
+}
+}  // namespace
+
+Value soprf(SPUContext* ctx, absl::Span<const spu::Value> inputs) {
+  // currently, we only support LowMC block cipher
+  SPU_ENFORCE(ctx->hasKernel("multi_key_lowmc_b"));
+  SPU_ENFORCE(!inputs.empty(), "inputs should not be empty");
+  SPU_ENFORCE(std::all_of(inputs.begin() + 1, inputs.end(),
+                          [&inputs](const spu::Value& v) {
+                            return v.shape() == inputs.front().shape();
+                          }),
+              "shape mismatch");
+  SPU_ENFORCE(std::all_of(inputs.begin() + 1, inputs.end(),
+                          [&inputs](const Value& v) {
+                            return v.dtype() == inputs.front().dtype();
+                          }),
+              "not all element has same dtype");
+
+  std::vector<Value> inp;
+  inp.reserve(inputs.size());
+  for (const auto& v : inputs) {
+    inp.push_back(_prefer_b(ctx, _2s(ctx, v)));
+  }
+
+  auto ret = dynDispatch<spu::Value>(ctx, "multi_key_lowmc_b", inp);
+
+  return ret.setDtype(inputs.front().dtype());
+}
+
+}  // namespace spu::kernel::hal
diff --git a/libspu/kernel/hal/soprf.h b/libspu/kernel/hal/soprf.h
new file mode 100644
index 000000000..02dd11ee1
--- /dev/null
+++ b/libspu/kernel/hal/soprf.h
@@ -0,0 +1,35 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "libspu/core/context.h"
+#include "libspu/core/value.h"
+
+namespace spu::kernel::hal {
+
+// Shared Oblivious PRF
+// ret = PRF(x, key), but with x, key in secret share.
+// now, `key` is generated inside kernel to guarantee the 128-bits security.
+Value soprf(SPUContext* ctx, const Value& x);
+
+// Multi-Key version of shared oblivious PRF
+// We use the scheme in:
+// REF: https://eprint.iacr.org/2019/518
+//
+// Warning: collisions may occur if you feed too many keys, although we limit
+// the probability to be less than 2^{-40} in almost all situations.
+Value soprf(SPUContext* ctx, absl::Span<const spu::Value> inputs);
+
+}  // namespace spu::kernel::hal
diff --git a/libspu/kernel/hal/type_cast.cc b/libspu/kernel/hal/type_cast.cc
index 89ad0142c..b432b8641 100644
--- a/libspu/kernel/hal/type_cast.cc
+++ b/libspu/kernel/hal/type_cast.cc
@@ -80,7 +80,14 @@ Value reveal(SPUContext* ctx, const Value& x) {
 
 Value reveal_to(SPUContext* ctx, const Value& x, size_t rank) {
   SPU_TRACE_HAL_LEAF(ctx, x, rank);
-  SPU_ENFORCE(x.isSecret());
+  SPU_ENFORCE(!x.isPublic());
+  if (x.isPrivate()) {
+    if (x.owner() == static_cast<int64_t>(rank)) {
+      return x;
+    } else {
+      return _s2v(ctx, _v2s(ctx, x), rank).setDtype(x.dtype());
+    }
+  }
   return _s2v(ctx, x, rank).setDtype(x.dtype());
 }
 
diff --git a/libspu/kernel/hlo/BUILD.bazel b/libspu/kernel/hlo/BUILD.bazel
index d9ee64ca5..80c1c9bde 100644
--- a/libspu/kernel/hlo/BUILD.bazel
+++ b/libspu/kernel/hlo/BUILD.bazel
@@ -305,6 +305,7 @@ spu_cc_test(
         ":sort",
         "//libspu/kernel:test_util",
         "//libspu/kernel/hal:polymorphic",
+        "//libspu/mpc/utils:simulate",
     ],
 )
 
@@ -349,3 +350,46 @@ spu_cc_test(
         "//libspu/kernel:test_util",
     ],
 )
+
+spu_cc_library(
+    name = "soprf",
+    srcs = ["soprf.cc"],
+    hdrs = ["soprf.h"],
+    deps = [
+        ":geometrical",
+        "//libspu/kernel/hal:soprf",
+    ],
+)
+
+spu_cc_test(
+    name = "soprf_test",
+    srcs = ["soprf_test.cc"],
+    deps = [
+        ":casting",
+        ":const",
+        ":soprf",
+        "//libspu/kernel:test_util",
+        "//libspu/mpc/utils:simulate",
+    ],
+)
+
+spu_cc_library(
+    name = "permute",
+    srcs = ["permute.cc"],
+    hdrs = ["permute.h"],
+    deps = [
+        "//libspu/kernel/hal:permute",
+    ],
+)
+
+spu_cc_test(
+    name = "permute_test",
+    srcs = ["permute_test.cc"],
+    deps = [
+        ":casting",
+        ":const",
+        ":permute",
+        "//libspu/kernel:test_util",
+        "//libspu/mpc/utils:simulate",
+    ],
+)
diff --git a/libspu/kernel/hlo/casting_test.cc b/libspu/kernel/hlo/casting_test.cc
index 6e59b4b30..415055a42 100644
--- a/libspu/kernel/hlo/casting_test.cc
+++ b/libspu/kernel/hlo/casting_test.cc
@@ -59,7 +59,8 @@ INSTANTIATE_TEST_SUITE_P(
     CastingTestInstances, CastingTest,
     testing::Combine(testing::Values(FieldType::FM64, FieldType::FM128),
                      testing::Values(ProtocolKind::REF2K, ProtocolKind::SEMI2K,
-                                     ProtocolKind::ABY3)),
+                                     ProtocolKind::ABY3,
+                                     ProtocolKind::CHEETAH)),
     [](const testing::TestParamInfo<CastingTest::ParamType> &p) {
       return fmt::format("{}x{}", std::get<0>(p.param), std::get<1>(p.param));
     });
diff --git a/libspu/kernel/hlo/permute.cc b/libspu/kernel/hlo/permute.cc
new file mode 100644
index 000000000..3a6dba272
--- /dev/null
+++ b/libspu/kernel/hlo/permute.cc
@@ -0,0 +1,56 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/kernel/hlo/permute.h"
+
+#include "libspu/core/context.h"
+
+namespace spu::kernel::hlo {
+
+namespace {
+
+bool check_permute_kernel(SPUContext* ctx) {
+  // TODO: Do checks according to visibility of inputs and perm later.
+  return ctx->hasKernel("rand_perm_m") && ctx->hasKernel("perm_am") &&
+         ctx->hasKernel("perm_ap") && ctx->hasKernel("inv_perm_am") &&
+         ctx->hasKernel("inv_perm_ap");
+}
+}  // namespace
+
+std::vector<spu::Value> InvPermute(SPUContext* ctx,
+                                   absl::Span<const spu::Value> inputs,
+                                   const spu::Value& perm, int64_t perm_dim) {
+  SPU_ENFORCE(check_permute_kernel(ctx),
+              "permute related kernel not supported");
+
+  auto inv_perm_fn = [&](absl::Span<const spu::Value> input) {
+    return hal::apply_inv_permute_1d(ctx, input, perm);
+  };
+
+  return hal::permute(ctx, inputs, perm_dim, inv_perm_fn);
+}
+
+std::vector<spu::Value> Permute(SPUContext* ctx,
+                                absl::Span<const spu::Value> inputs,
+                                const spu::Value& perm, int64_t perm_dim) {
+  SPU_ENFORCE(check_permute_kernel(ctx),
+              "permute related kernel not supported");
+
+  auto perm_fn = [&](absl::Span<const spu::Value> input) {
+    return hal::apply_permute_1d(ctx, input, perm);
+  };
+
+  return hal::permute(ctx, inputs, perm_dim, perm_fn);
+}
+}  // namespace spu::kernel::hlo
diff --git a/libspu/kernel/hlo/permute.h b/libspu/kernel/hlo/permute.h
new file mode 100644
index 000000000..4f8189a28
--- /dev/null
+++ b/libspu/kernel/hlo/permute.h
@@ -0,0 +1,48 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "libspu/kernel/hal/permute.h"
+
+namespace spu::kernel::hlo {
+
+// Inverse permute vector `inputs` over permutation `perm`
+// Let [n] = {0,1,2,...,n-1}, then perm: [n] -> [n] should be an invertible
+// permutation, we denote perm^{-1} as its inverse.
+// For each input x, we get y = perm^{-1} (x), i.e. y[i] = x[perm^{-1}(i)]
+//
+// Note: to simplify the implementation, we FORCE the visibility of inputs to be
+// the SAME (for Private, the OWNER should also be the SAME).
+// IMPORTANT NOTE: when perm is Private (owner i), and inputs include some mix
+// of either Secret or Private (with owner j != i), you should Seal the Private
+// inputs (with owner j != i) first, and do permute once to improve performance.
+std::vector<spu::Value> InvPermute(SPUContext* ctx,
+                                   absl::Span<const spu::Value> inputs,
+                                   const spu::Value& perm, int64_t perm_dim);
+
+// Permute vector `inputs` over permutation `perm`
+// Let [n] = {0,1,2,...,n-1}, then perm: [n] -> [n] should be an invertible
+// permutation.
+// For each input x, we get y = perm(x), i.e. y[i] = x[perm(i)]
+//
+// Note: to simplify the implementation, we FORCE the visibility of inputs to be
+// the SAME (for Private, the OWNER should also be the SAME).
+// IMPORTANT NOTE: when perm is Private (owner i), and inputs include some mix
+// of either Secret or Private (with owner j != i), you should Seal the Private
+// inputs (with owner j != i) first, and do permute once to improve performance.
+std::vector<spu::Value> Permute(SPUContext* ctx,
+                                absl::Span<const spu::Value> inputs,
+                                const spu::Value& perm, int64_t perm_dim);
+}  // namespace spu::kernel::hlo
diff --git a/libspu/kernel/hlo/permute_test.cc b/libspu/kernel/hlo/permute_test.cc
new file mode 100644
index 000000000..c1b374183
--- /dev/null
+++ b/libspu/kernel/hlo/permute_test.cc
@@ -0,0 +1,352 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/kernel/hlo/permute.h"
+
+#include "gtest/gtest.h"
+#include "xtensor/xio.hpp"
+
+#include "libspu/core/encoding.h"
+#include "libspu/kernel/hal/constants.h"
+#include "libspu/kernel/hlo/casting.h"
+#include "libspu/kernel/hlo/const.h"
+#include "libspu/kernel/test_util.h"
+#include "libspu/mpc/utils/simulate.h"
+
+namespace spu::kernel::hlo {
+
+namespace {
+
+using PermuteFunc = std::function<std::vector<Value>(
+    SPUContext*, absl::Span<const spu::Value> inputs, const spu::Value& perm,
+    int64_t perm_dim)>;
+
+const FieldType kField = FM64;
+
+enum class VisType {
+  VisPriv0 = 0,  // private, owned by party 0
+  VisPriv1 = 1,  // private, owned by party 1
+  VisPub = 2,
+  VisSec = 3,
+};
+
+const std::vector<VisType> kVisTypes = {VisType::VisPub, VisType::VisSec,
+                                        VisType::VisPriv0, VisType::VisPriv1};
+
+inline std::string get_vis_str(VisType type) {
+  switch (type) {
+    case VisType::VisPub:
+      return "VisPub";
+    case VisType::VisSec:
+      return "VisSec";
+    case VisType::VisPriv0:
+      return "VisPriv0";
+    case VisType::VisPriv1:
+      return "VisPriv1";
+    default:
+      return "Unknown";
+  }
+}
+
+bool checkCommFree(VisType x_vis, VisType perm_vis) {
+  // Permutation is comm. free if:
+  //  1. perm is Public
+  //  2. perm is Private, x is Public or Private with same owner
+  if (perm_vis == VisType::VisPub) {
+    return true;
+  } else if (perm_vis == VisType::VisPriv0 &&
+             (x_vis == VisType::VisPriv0 || x_vis == VisType::VisPub)) {
+    return true;
+  } else if (perm_vis == VisType::VisPriv1 &&
+             (x_vis == VisType::VisPriv1 || x_vis == VisType::VisPub)) {
+    return true;
+  }
+
+  return false;
+}
+
+bool checkSpPass(VisType x_vis, VisType perm_vis) {
+  // `inv_perm_av` will hit, when:
+  //   1. perm is Private and x is Secret
+  //   2. perm is Private and x is Private with different owner
+  if (perm_vis == VisType::VisPriv0) {
+    if (x_vis == VisType::VisSec || x_vis == VisType::VisPriv1) {
+      return true;
+    }
+  } else if (perm_vis == VisType::VisPriv1) {
+    if (x_vis == VisType::VisSec || x_vis == VisType::VisPriv0) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Value makeTestValue(SPUContext* ctx, PtBufferView init, VisType vis) {
+  DataType dtype = getEncodeType(init.pt_type);
+
+  auto res = hal::constant(ctx, init, dtype, {});
+
+  switch (vis) {
+    case VisType::VisPub:
+      return res;
+    case VisType::VisSec: {
+      return Seal(ctx, res);
+    }
+    case VisType::VisPriv0: {
+      res = Seal(ctx, res);
+      return RevealTo(ctx, res, 0);
+    }
+    case VisType::VisPriv1: {
+      res = Seal(ctx, res);
+      return RevealTo(ctx, res, 1);
+    }
+    default:
+      SPU_THROW("Unknown vis type");
+  }
+}
+
+template <typename T>
+xt::xarray<T> evalSinglePermuteOp(SPUContext* ctx, VisType x_vis,
+                                  VisType perm_vis, PtBufferView x,
+                                  PtBufferView perm,
+                                  const PermuteFunc& perm_func,
+                                  int64_t perm_dim = 0) {
+  auto x_v = makeTestValue(ctx, x, x_vis);
+  auto perm_v = makeTestValue(ctx, perm, perm_vis);
+
+  size_t send_round = ctx->lctx()->GetStats()->sent_actions;
+  size_t recv_round = ctx->lctx()->GetStats()->recv_actions;
+  auto perm_ret = perm_func(ctx, {x_v}, perm_v, perm_dim);
+  send_round = ctx->lctx()->GetStats()->sent_actions - send_round;
+  recv_round = ctx->lctx()->GetStats()->recv_actions - recv_round;
+
+  // test whether hit the proper kernel.
+  if (checkCommFree(x_vis, perm_vis)) {
+    EXPECT_EQ(send_round, 0);
+  }
+  if (ctx->hasKernel("inv_perm_av") && checkSpPass(x_vis, perm_vis)) {
+    auto n_repeat = x_v.shape().numel() / x_v.shape().dim(perm_dim);
+    // For ss version, at least 3 rounds.
+    EXPECT_LE(std::min(send_round, recv_round), 2 * n_repeat);
+  }
+  EXPECT_EQ(perm_ret.size(), 1);
+
+  auto ret = perm_ret[0];
+  if (!ret.isPublic()) {
+    ret = Reveal(ctx, ret);
+  }
+  EXPECT_TRUE(ret.isPublic());
+
+  return hal::dump_public_as<T>(ctx, ret);
+}
+
+template <typename T>
+std::vector<xt::xarray<T>> evalMultiplePermuteOp(
+    SPUContext* ctx, VisType x_vis, VisType perm_vis, PtBufferView x,
+    PtBufferView perm, const PermuteFunc& perm_func, int64_t perm_dim = 0) {
+  std::vector<Value> x_vec;
+  x_vec.reserve(4);
+  x_vec.push_back(makeTestValue(ctx, x, x_vis));
+  x_vec.push_back(makeTestValue(ctx, x, x_vis));
+  x_vec.push_back(makeTestValue(ctx, x, x_vis));
+  x_vec.push_back(makeTestValue(ctx, x, x_vis));
+
+  auto perm_v = makeTestValue(ctx, perm, perm_vis);
+
+  auto perm_ret = perm_func(ctx, x_vec, perm_v, perm_dim);
+  EXPECT_EQ(perm_ret.size(), 4);
+
+  std::vector<xt::xarray<T>> ret_vec;
+  for (auto ret : perm_ret) {
+    if (!ret.isPublic()) {
+      ret = Reveal(ctx, ret);
+    }
+    EXPECT_TRUE(ret.isPublic());
+    ret_vec.push_back(hal::dump_public_as<T>(ctx, ret));
+  }
+
+  return ret_vec;
+}
+
+}  // namespace
+
+class PermuteTest : public ::testing::TestWithParam<
+                        std::tuple<VisType, VisType, ProtocolKind, size_t>> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    GeneralPermute, PermuteTest,
+    testing::Combine(testing::ValuesIn(kVisTypes),   // vis of x
+                     testing::ValuesIn(kVisTypes),   // vis of perm
+                     testing::Values(SEMI2K, ABY3),  // underlying protocol
+                     testing::Values(2, 3)  // npc=2 is not valid in ABY3
+                     ),
+    [](const testing::TestParamInfo<PermuteTest::ParamType>& p) {
+      return fmt::format("{}x{}x{}x{}", get_vis_str(std::get<0>(p.param)),
+                         get_vis_str(std::get<1>(p.param)),
+                         std::get<2>(p.param), std::get<3>(p.param));
+    });
+
+TEST_P(PermuteTest, SinglePermuteWork) {
+  const VisType x_vis = std::get<0>(GetParam());
+  const VisType perm_vis = std::get<1>(GetParam());
+  const ProtocolKind protocol = std::get<2>(GetParam());
+  const size_t npc = std::get<3>(GetParam());
+
+  if (protocol == ABY3 && npc == 2) {
+    return;
+  }
+
+  xt::xarray<int64_t> x = {10, 0, 2, 3, 9, 1, 5, 6};
+  xt::xarray<int64_t> perm = {2, 7, 1, 6, 0, 4, 3, 5};
+
+  xt::xarray<int64_t> expected_inv_perm = {9, 2, 10, 5, 1, 6, 3, 0};
+  xt::xarray<int64_t> expected_perm = {2, 6, 0, 5, 10, 9, 3, 1};
+
+  mpc::utils::simulate(
+      npc, [&](const std::shared_ptr<yacl::link::Context>& lctx) {
+        SPUContext sctx = test::makeSPUContext(protocol, kField, lctx);
+
+        // test of inv_permute
+        auto inv_perm_ret = evalSinglePermuteOp<int64_t>(&sctx, x_vis, perm_vis,
+                                                         x, perm, InvPermute);
+        EXPECT_TRUE(xt::allclose(expected_inv_perm, inv_perm_ret, 0.001, 0.001))
+            << expected_inv_perm << std::endl
+            << inv_perm_ret << std::endl;
+
+        // test of permute
+        auto perm_ret = evalSinglePermuteOp<int64_t>(&sctx, x_vis, perm_vis, x,
+                                                     perm, Permute);
+        EXPECT_TRUE(xt::allclose(expected_perm, perm_ret, 0.001, 0.001))
+            << expected_perm << std::endl
+            << perm_ret << std::endl;
+      });
+}
+
+TEST_P(PermuteTest, PermDimWork) {
+  const VisType x_vis = std::get<0>(GetParam());
+  const VisType perm_vis = std::get<1>(GetParam());
+  const ProtocolKind protocol = std::get<2>(GetParam());
+  const size_t npc = std::get<3>(GetParam());
+
+  if (protocol == ABY3 && npc == 2) {
+    return;
+  }
+
+  xt::xarray<int64_t> x = {{10, 0, 2, 3, 9, 1, 5, 6},
+                           {-10, 0, -2, -3, -9, -1, -5, -6}};
+  xt::xarray<int64_t> perm = {2, 7, 1, 6, 0, 4, 3, 5};
+
+  xt::xarray<int64_t> expected_inv_perm = {{9, 2, 10, 5, 1, 6, 3, 0},
+                                           {-9, -2, -10, -5, -1, -6, -3, -0}};
+  xt::xarray<int64_t> expected_perm = {{2, 6, 0, 5, 10, 9, 3, 1},
+                                       {-2, -6, -0, -5, -10, -9, -3, -1}};
+
+  mpc::utils::simulate(
+      npc, [&](const std::shared_ptr<yacl::link::Context>& lctx) {
+        SPUContext sctx = test::makeSPUContext(protocol, kField, lctx);
+
+        // test of inv_permute
+        auto inv_perm_ret = evalSinglePermuteOp<int64_t>(
+            &sctx, x_vis, perm_vis, x, perm, InvPermute, /*perm_dim*/ 1);
+        EXPECT_TRUE(xt::allclose(expected_inv_perm, inv_perm_ret, 0.001, 0.001))
+            << expected_inv_perm << std::endl
+            << inv_perm_ret << std::endl;
+
+        // test of permute
+        auto perm_ret = evalSinglePermuteOp<int64_t>(
+            &sctx, x_vis, perm_vis, x, perm, Permute, /*perm_dim*/ 1);
+        EXPECT_TRUE(xt::allclose(expected_perm, perm_ret, 0.001, 0.001))
+            << expected_perm << std::endl
+            << perm_ret << std::endl;
+      });
+}
+
+TEST_P(PermuteTest, MultiplePermuteWork) {
+  const VisType x_vis = std::get<0>(GetParam());
+  const VisType perm_vis = std::get<1>(GetParam());
+  const ProtocolKind protocol = std::get<2>(GetParam());
+  const size_t npc = std::get<3>(GetParam());
+
+  if (protocol == ABY3 && npc == 2) {
+    return;
+  }
+
+  xt::xarray<int64_t> x = {10, 0, 2, 3, 9, 1, 5, 6};
+  xt::xarray<int64_t> perm = {2, 7, 1, 6, 0, 4, 3, 5};
+
+  xt::xarray<int64_t> expected_inv_perm = {9, 2, 10, 5, 1, 6, 3, 0};
+  xt::xarray<int64_t> expected_perm = {2, 6, 0, 5, 10, 9, 3, 1};
+
+  mpc::utils::simulate(
+      npc, [&](const std::shared_ptr<yacl::link::Context>& lctx) {
+        SPUContext sctx = test::makeSPUContext(protocol, kField, lctx);
+
+        // test of inv_permute
+        auto inv_perm_ret_vec = evalMultiplePermuteOp<int64_t>(
+            &sctx, x_vis, perm_vis, x, perm, InvPermute);
+        for (const auto& inv_perm_ret : inv_perm_ret_vec) {
+          EXPECT_TRUE(
+              xt::allclose(expected_inv_perm, inv_perm_ret, 0.001, 0.001))
+              << expected_inv_perm << std::endl
+              << inv_perm_ret << std::endl;
+        }
+
+        // test of permute
+        auto perm_ret_vec = evalMultiplePermuteOp<int64_t>(
+            &sctx, x_vis, perm_vis, x, perm, Permute);
+        for (const auto& perm_ret : perm_ret_vec) {
+          EXPECT_TRUE(xt::allclose(expected_perm, perm_ret, 0.001, 0.001))
+              << expected_perm << std::endl
+              << perm_ret << std::endl;
+        }
+      });
+}
+
+class PermuteEmptyTest : public ::testing::TestWithParam<ProtocolKind> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    PermuteEmpty, PermuteEmptyTest,
+    testing::Values(ProtocolKind::SEMI2K, ProtocolKind::ABY3),
+    [](const testing::TestParamInfo<PermuteEmptyTest::ParamType>& p) {
+      return fmt::format("{}", p.param);
+    });
+
+TEST_P(PermuteEmptyTest, Empty) {
+  ProtocolKind prot = GetParam();
+
+  mpc::utils::simulate(
+      3, [&](const std::shared_ptr<yacl::link::Context>& lctx) {
+        SPUContext sctx = test::makeSPUContext(prot, kField, lctx);
+
+        auto empty_x =
+            Seal(&sctx, Constant(&sctx, static_cast<int64_t>(1), {0}));
+        auto empty_perm =
+            Seal(&sctx, Constant(&sctx, static_cast<int64_t>(0), {0}));
+
+        auto empty_inv_perm_x = InvPermute(&sctx, {empty_x}, empty_perm, 0);
+        EXPECT_EQ(empty_inv_perm_x.size(), 1);
+        EXPECT_EQ(empty_inv_perm_x[0].numel(), 0);
+        EXPECT_EQ(empty_inv_perm_x[0].shape().size(), 1);
+        EXPECT_EQ(empty_inv_perm_x[0].shape()[0], 0);
+
+        auto empty_perm_x = Permute(&sctx, {empty_x}, empty_perm, 0);
+        EXPECT_EQ(empty_perm_x.size(), 1);
+        EXPECT_EQ(empty_perm_x[0].numel(), 0);
+        EXPECT_EQ(empty_perm_x[0].shape().size(), 1);
+        EXPECT_EQ(empty_perm_x[0].shape()[0], 0);
+      });
+}
+
+}  // namespace spu::kernel::hlo
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_stream.h b/libspu/kernel/hlo/soprf.cc
similarity index 58%
rename from libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_stream.h
rename to libspu/kernel/hlo/soprf.cc
index 04dcc88e0..953732420 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_stream.h
+++ b/libspu/kernel/hlo/soprf.cc
@@ -12,20 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
+#include "libspu/kernel/hlo/soprf.h"
 
-#include <cstdint>
+#include "libspu/kernel/hlo/geometrical.h"
 
-namespace spu::mpc::semi2k::beaver::ttp_server {
+namespace spu::kernel::hlo {
 
-constexpr size_t kReplayChunkSize = 50 * 1024 * 1024;  // bytes
+Value SoPrf(SPUContext* ctx, const Value& x) { return hal::soprf(ctx, x); }
 
-constexpr size_t kUpStreamChunkSize = 50 * 1024 * 1024;    // bytes
-constexpr size_t kDownStreamChunkSize = 50 * 1024 * 1024;  // bytes
+Value SoPrf(SPUContext* ctx, absl::Span<const spu::Value> inputs) {
+  return hal::soprf(ctx, inputs);
+}
 
-// A list of buffer streams
-struct BeaverDownStreamMeta {
-  int32_t err_code = 0;
-};
-
-}  // namespace spu::mpc::semi2k::beaver::ttp_server
\ No newline at end of file
+}  // namespace spu::kernel::hlo
diff --git a/libspu/kernel/hlo/soprf.h b/libspu/kernel/hlo/soprf.h
new file mode 100644
index 000000000..1a673b337
--- /dev/null
+++ b/libspu/kernel/hlo/soprf.h
@@ -0,0 +1,37 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "libspu/kernel/hal/soprf.h"
+
+namespace spu::kernel::hlo {
+
+// Shared oblivious PRF.
+// ret = PRF(x, key), where both x and key are secret-shared.
+// For security the key should be 128 bits long, but `x` may be only 64 or
+// even 32 bits wide, which makes passing a separate FM128 `key` param hard,
+// so the kernel generates a shared key internally.
+// TODO: add `key` as a param
+Value SoPrf(SPUContext* ctx, const Value& x);
+
+// Multi-key version of the shared oblivious PRF.
+// We use the scheme in:
+// REF: https://eprint.iacr.org/2019/518
+//
+// Warning: collisions may occur if you feed too many keys, although we
+// limit the probability to less than 2^{-40} in almost all situations.
+Value SoPrf(SPUContext* ctx, absl::Span<const spu::Value> inputs);
+
+}  // namespace spu::kernel::hlo
diff --git a/libspu/kernel/hlo/soprf_test.cc b/libspu/kernel/hlo/soprf_test.cc
new file mode 100644
index 000000000..6c4534421
--- /dev/null
+++ b/libspu/kernel/hlo/soprf_test.cc
@@ -0,0 +1,120 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/kernel/hlo/soprf.h"
+
+#include "gtest/gtest.h"
+#include "xtensor/xio.hpp"
+
+#include "libspu/kernel/hlo/casting.h"
+#include "libspu/kernel/hlo/const.h"
+#include "libspu/kernel/test_util.h"
+#include "libspu/mpc/utils/ring_ops.h"
+#include "libspu/mpc/utils/simulate.h"
+
+namespace spu::kernel::hlo {
+
+class SoPrfTest
+    : public ::testing::TestWithParam<std::tuple<FieldType, ProtocolKind>> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    SoPrfTestInstances, SoPrfTest,
+    testing::Combine(testing::Values(FieldType::FM64, FieldType::FM128),
+                     testing::Values(ProtocolKind::SEMI2K)),
+    [](const testing::TestParamInfo<SoPrfTest::ParamType> &p) {
+      return fmt::format("{}x{}", std::get<0>(p.param), std::get<1>(p.param));
+    });
+
+// SoPrf on a zero-length input must yield a zero-length, 1-D result.
+TEST_P(SoPrfTest, EmptyWork) {
+  FieldType field = std::get<0>(GetParam());
+  ProtocolKind prot = std::get<1>(GetParam());
+  mpc::utils::simulate(
+      3, [&](const std::shared_ptr<yacl::link::Context> &lctx) {
+        SPUContext sctx = test::makeSPUContext(prot, field, lctx);
+
+        auto empty_x = Seal(&sctx, Constant(&sctx, 1, {0}));
+        auto empty_ret = SoPrf(&sctx, empty_x);
+        // The output is expected to have the shape {0}.
+        EXPECT_EQ(empty_ret.numel(), 0);
+        EXPECT_EQ(empty_ret.shape().size(), 1);
+        EXPECT_EQ(empty_ret.shape()[0], 0);
+      });
+}
+
+// Runs SoPrf on two random inputs: shapes are kept and the outputs differ.
+TEST_P(SoPrfTest, Work) {
+  FieldType field = std::get<0>(GetParam());
+  ProtocolKind prot = std::get<1>(GetParam());
+  mpc::utils::simulate(
+      3, [&](const std::shared_ptr<yacl::link::Context> &lctx) {
+        SPUContext sctx = test::makeSPUContext(prot, field, lctx);
+
+        const Shape shape = {20, 17};
+        xt::xarray<uint64_t> x = xt::random::randint<uint64_t>(shape, 0);
+        xt::xarray<uint64_t> y = xt::random::randint<uint64_t>(shape, 0);
+
+        auto x_share = Seal(&sctx, Constant(&sctx, x, shape));
+        auto y_share = Seal(&sctx, Constant(&sctx, y, shape));
+        // Evaluate the PRF separately on each input.
+        auto ret_x = SoPrf(&sctx, x_share);
+        auto ret_y = SoPrf(&sctx, y_share);
+        EXPECT_EQ(ret_x.shape(), shape);
+        EXPECT_EQ(ret_x.shape(), ret_y.shape());
+
+        auto ret_x_pub = Reveal(&sctx, ret_x);
+        auto ret_y_pub = Reveal(&sctx, ret_y);
+        // Only inequality of the two revealed outputs is asserted here.
+        EXPECT_FALSE(mpc::ring_all_equal(ret_x_pub.data(), ret_y_pub.data()));
+      });
+}
+
+class MultiKeySoPrfTest : public ::testing::TestWithParam<
+                              std::tuple<FieldType, ProtocolKind, size_t>> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    MultiKeySoPrfTestInstances, MultiKeySoPrfTest,
+    testing::Combine(testing::Values(FieldType::FM64, FieldType::FM128),
+                     testing::Values(ProtocolKind::SEMI2K),
+                     testing::Values(1, 2, 4)),  // num of keys
+    [](const testing::TestParamInfo<MultiKeySoPrfTest::ParamType> &p) {
+      return fmt::format("{}x{}x{}", std::get<0>(p.param), std::get<1>(p.param),
+                         std::get<2>(p.param));
+    });
+
+// Feeds `num_keys` random inputs to the multi-key SoPrf overload.
+TEST_P(MultiKeySoPrfTest, Work) {
+  FieldType field = std::get<0>(GetParam());
+  ProtocolKind prot = std::get<1>(GetParam());
+  size_t num_keys = std::get<2>(GetParam());
+  mpc::utils::simulate(
+      3, [&](const std::shared_ptr<yacl::link::Context> &lctx) {
+        SPUContext sctx = test::makeSPUContext(prot, field, lctx);
+
+        const Shape shape = {20, 17};
+        std::vector<spu::Value> inputs;
+        inputs.reserve(num_keys);
+        for (size_t i = 0; i < num_keys; ++i) {
+          xt::xarray<uint64_t> tmp = xt::random::randint<uint64_t>(shape, 0);
+          auto v = Seal(&sctx, Constant(&sctx, tmp, shape));
+          inputs.push_back(v);
+        }
+
+        auto ret = SoPrf(&sctx, absl::MakeSpan(inputs));
+        // Only the output shape is verified; values are not checked.
+        EXPECT_EQ(ret.shape(), shape);
+      });
+}
+
+}  // namespace spu::kernel::hlo
diff --git a/libspu/kernel/hlo/sort_test.cc b/libspu/kernel/hlo/sort_test.cc
index cabcb4fdc..89519b9e6 100644
--- a/libspu/kernel/hlo/sort_test.cc
+++ b/libspu/kernel/hlo/sort_test.cc
@@ -27,7 +27,14 @@
 #include "libspu/kernel/hlo/casting.h"
 #include "libspu/kernel/hlo/const.h"
 #include "libspu/kernel/test_util.h"
+#include "libspu/mpc/utils/simulate.h"
 
+// to print method name
+std::ostream &operator<<(std::ostream &os,
+                         spu::RuntimeConfig_SortMethod method) {
+  os << spu::RuntimeConfig::SortMethod_Name(method);
+  return os;
+}
 namespace spu::kernel::hlo {
 
 TEST(SortTest, Simple) {
@@ -228,34 +235,125 @@ TEST(SortTest, LargeNumel) {
   }
 }
 
-TEST(SimpleSortTest, MultiOperands) {
-  SPUContext ctx = test::makeSPUContext();
-  xt::xarray<float> k1 = {7, 6, 5, 5, 4, 4, 4, 1, 3, 3};
-  xt::xarray<float> k2 = {1, 2, 3, 6, 7, 6, 5, 2, 1, 2};
+class SimpleSortTest
+    : public ::testing::TestWithParam<std::tuple<
+          size_t, FieldType, ProtocolKind, RuntimeConfig::SortMethod>> {};
 
-  xt::xarray<float> sorted_k1 = {1, 3, 3, 4, 4, 4, 5, 5, 6, 7};
-  xt::xarray<float> sorted_k2 = {2, 1, 2, 5, 6, 7, 3, 6, 2, 1};
+TEST_P(SimpleSortTest, MultiOperands) {
+  size_t npc = std::get<0>(GetParam());
+  FieldType field = std::get<1>(GetParam());
+  ProtocolKind prot = std::get<2>(GetParam());
+  RuntimeConfig::SortMethod method = std::get<3>(GetParam());
 
-  Value k1_v = test::makeValue(&ctx, k1, VIS_SECRET);
-  Value k2_v = test::makeValue(&ctx, k2, VIS_SECRET);
+  mpc::utils::simulate(
+      npc, [&](const std::shared_ptr<yacl::link::Context> &lctx) {
+        RuntimeConfig cfg;
+        cfg.set_protocol(prot);
+        cfg.set_field(field);
+        cfg.set_enable_action_trace(false);
+        cfg.set_sort_method(method);
+        SPUContext ctx = test::makeSPUContext(cfg, lctx);
 
-  std::vector<spu::Value> rets =
-      SimpleSort(&ctx, {k1_v, k2_v}, 0, hal::SortDirection::Ascending, 2);
+        xt::xarray<float> k1 = {7, 6, 5, 5, 4, 4, 4, 1, 3, 3};
+        xt::xarray<float> k2 = {1, 2, 3, 6, 7, 6, 5, 2, 1, 2};
 
-  EXPECT_EQ(rets.size(), 2);
+        xt::xarray<float> sorted_k1 = {1, 3, 3, 4, 4, 4, 5, 5, 6, 7};
+        xt::xarray<float> sorted_k2 = {2, 1, 2, 5, 6, 7, 3, 6, 2, 1};
 
-  auto sorted_k1_hat =
-      hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[0]));
-  auto sorted_k2_hat =
-      hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[1]));
+        Value k1_v = test::makeValue(&ctx, k1, VIS_SECRET);
+        Value k2_v = test::makeValue(&ctx, k2, VIS_SECRET);
 
-  EXPECT_TRUE(xt::allclose(sorted_k1, sorted_k1_hat, 0.01, 0.001))
-      << sorted_k1 << std::endl
-      << sorted_k1_hat << std::endl;
+        std::vector<spu::Value> rets =
+            SimpleSort(&ctx, {k1_v, k2_v}, 0, hal::SortDirection::Ascending, 2);
 
-  EXPECT_TRUE(xt::allclose(sorted_k2, sorted_k2_hat, 0.01, 0.001))
-      << sorted_k2 << std::endl
-      << sorted_k2_hat << std::endl;
+        EXPECT_EQ(rets.size(), 2);
+
+        auto sorted_k1_hat =
+            hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[0]));
+        auto sorted_k2_hat =
+            hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[1]));
+
+        EXPECT_TRUE(xt::allclose(sorted_k1, sorted_k1_hat, 0.01, 0.001))
+            << sorted_k1 << std::endl
+            << sorted_k1_hat << std::endl;
+
+        EXPECT_TRUE(xt::allclose(sorted_k2, sorted_k2_hat, 0.01, 0.001))
+            << sorted_k2 << std::endl
+            << sorted_k2_hat << std::endl;
+      });
 }
 
+// Sorts two secret operands by the first one only and checks both outputs.
+TEST_P(SimpleSortTest, SingleKeyWithPayload) {
+  size_t npc = std::get<0>(GetParam());
+  FieldType field = std::get<1>(GetParam());
+  ProtocolKind prot = std::get<2>(GetParam());
+  RuntimeConfig::SortMethod method = std::get<3>(GetParam());
+  mpc::utils::simulate(
+      npc, [&](const std::shared_ptr<yacl::link::Context> &lctx) {
+        RuntimeConfig cfg;
+        cfg.set_protocol(prot);
+        cfg.set_field(field);
+        cfg.set_enable_action_trace(false);
+        cfg.set_sort_method(method);
+        SPUContext ctx = test::makeSPUContext(cfg, lctx);
+
+        xt::xarray<float> k1 = {7, 6, 5, 4, 1, 3, 2};
+        xt::xarray<float> k2 = {1, 2, 3, 6, 7, 6, 5};
+        // Expected output: k1 ascending, k2 reordered the same way as k1.
+        xt::xarray<float> sorted_k1 = {1, 2, 3, 4, 5, 6, 7};
+        xt::xarray<float> sorted_k2 = {7, 5, 6, 6, 3, 2, 1};
+
+        Value k1_v = test::makeValue(&ctx, k1, VIS_SECRET);
+        Value k2_v = test::makeValue(&ctx, k2, VIS_SECRET);
+        // Last argument is 1 (MultiOperands passes 2): k2 is carried as payload.
+        std::vector<spu::Value> rets =
+            SimpleSort(&ctx, {k1_v, k2_v}, 0, hal::SortDirection::Ascending, 1);
+
+        EXPECT_EQ(rets.size(), 2);
+
+        auto sorted_k1_hat =
+            hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[0]));
+        auto sorted_k2_hat =
+            hal::dump_public_as<float>(&ctx, hal::reveal(&ctx, rets[1]));
+
+        EXPECT_TRUE(xt::allclose(sorted_k1, sorted_k1_hat, 0.01, 0.001))
+            << sorted_k1 << std::endl
+            << sorted_k1_hat << std::endl;
+
+        EXPECT_TRUE(xt::allclose(sorted_k2, sorted_k2_hat, 0.01, 0.001))
+            << sorted_k2 << std::endl
+            << sorted_k2_hat << std::endl;
+      });
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    SimpleSort2PCTestInstances, SimpleSortTest,
+    testing::Combine(
+        testing::Values(2), testing::Values(FieldType::FM32, FieldType::FM64),
+        testing::Values(ProtocolKind::SEMI2K, ProtocolKind::CHEETAH),
+        testing::Values(RuntimeConfig::SORT_DEFAULT, RuntimeConfig::SORT_RADIX,
+                        RuntimeConfig::SORT_QUICK,
+                        RuntimeConfig::SORT_NETWORK)),
+    [](const testing::TestParamInfo<SimpleSortTest::ParamType> &p) {
+      return fmt::format("{}x{}x{}x{}", std::get<0>(p.param),
+                         std::get<1>(p.param), std::get<2>(p.param),
+                         std::get<3>(p.param));
+    });
+
+INSTANTIATE_TEST_SUITE_P(
+    SimpleSort3PCTestInstances, SimpleSortTest,
+    testing::Combine(testing::Values(3),
+                     testing::Values(FieldType::FM32, FieldType::FM64),
+                     testing::Values(ProtocolKind::SEMI2K, ProtocolKind::ABY3),
+                     testing::Values(RuntimeConfig::SORT_DEFAULT,
+                                     RuntimeConfig::SORT_RADIX,
+                                     RuntimeConfig::SORT_QUICK,
+                                     RuntimeConfig::SORT_NETWORK)),
+    [](const testing::TestParamInfo<SimpleSortTest::ParamType> &p) {
+      return fmt::format("{}x{}x{}x{}", std::get<0>(p.param),
+                         std::get<1>(p.param), std::get<2>(p.param),
+                         std::get<3>(p.param));
+    });
+
 }  // namespace spu::kernel::hlo
diff --git a/libspu/mpc/BUILD.bazel b/libspu/mpc/BUILD.bazel
index ffced4cb6..712284906 100644
--- a/libspu/mpc/BUILD.bazel
+++ b/libspu/mpc/BUILD.bazel
@@ -35,7 +35,7 @@ spu_cc_library(
         ":io_interface",
         "//libspu/mpc/utils:ring_ops",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
     alwayslink = True,
 )
@@ -96,7 +96,7 @@ spu_cc_library(
         "//libspu/mpc:kernel",
         "//libspu/mpc/common:communicator",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
     alwayslink = True,
 )
@@ -122,7 +122,7 @@ spu_cc_library(
         "//libspu/mpc:api_test_params",
         "//libspu/mpc/common:communicator",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
     alwayslink = True,
 )
diff --git a/libspu/mpc/aby3/boolean.cc b/libspu/mpc/aby3/boolean.cc
index 96033cc56..d83066a33 100644
--- a/libspu/mpc/aby3/boolean.cc
+++ b/libspu/mpc/aby3/boolean.cc
@@ -64,6 +64,36 @@ NdArrayRef CastTypeB::proc(KernelEvalContext*, const NdArrayRef& in,
   return out;
 }
 
+// Generates a random 1-bit boolean share from the low bits of a PRSS pair.
+NdArrayRef RandB::proc(KernelEvalContext* ctx, const Shape& shape) const {
+  auto* prg_state = ctx->getState<PrgState>();
+  const auto field = ctx->getState<Z2kState>()->getDefaultField();
+
+  return DISPATCH_ALL_FIELDS(field, [&]() {
+    auto [r0, r1] =
+        prg_state->genPrssPair(field, shape, PrgState::GenPrssCtrl::Both);
+    // only rand bit is supported
+    const size_t nbits = 1;
+    const PtType btype = calcBShareBacktype(nbits);
+
+    NdArrayView<ring2k_t> _r0(r0);
+    NdArrayView<ring2k_t> _r1(r1);
+    return DISPATCH_UINT_PT_TYPES(btype, [&]() {
+      using bshr_el_t = ScalarT;
+      using bshr_t = std::array<bshr_el_t, 2>;
+      NdArrayRef out(makeType<BShrTy>(btype, nbits), shape);
+      NdArrayView<bshr_t> _out(out);
+      // Keep only the least-significant bit of each random ring element.
+      pforeach(0, shape.numel(), [&](int64_t idx) {
+        _out[idx][0] = static_cast<bshr_el_t>(_r0[idx] & 1);
+        _out[idx][1] = static_cast<bshr_el_t>(_r1[idx] & 1);
+      });
+
+      return out;
+    });
+  });
+}
+
 NdArrayRef B2P::proc(KernelEvalContext* ctx, const NdArrayRef& in) const {
   auto* comm = ctx->getState<Communicator>();
   const PtType btype = in.eltype().as<BShrTy>()->getBacktype();
diff --git a/libspu/mpc/aby3/boolean.h b/libspu/mpc/aby3/boolean.h
index dac53b108..b5a0d9584 100644
--- a/libspu/mpc/aby3/boolean.h
+++ b/libspu/mpc/aby3/boolean.h
@@ -43,6 +43,17 @@ class CastTypeB : public CastTypeKernel {
                   const Type& to_type) const override;
 };
 
+// Random boolean-share kernel; reports zero latency and zero communication.
+class RandB : public RandKernel {
+ public:
+  static constexpr const char* kBindName() { return "rand_b"; }
+
+  ce::CExpr latency() const override { return ce::Const(0); }
+  ce::CExpr comm() const override { return ce::Const(0); }
+
+  NdArrayRef proc(KernelEvalContext* ctx, const Shape& shape) const override;
+};
+
 class B2P : public UnaryKernel {
  public:
   static constexpr const char* kBindName() { return "b2p"; }
diff --git a/libspu/mpc/aby3/protocol.cc b/libspu/mpc/aby3/protocol.cc
index 9ca2e0199..e36a32732 100644
--- a/libspu/mpc/aby3/protocol.cc
+++ b/libspu/mpc/aby3/protocol.cc
@@ -67,7 +67,7 @@ void regAby3Protocol(SPUContext* ctx,
           aby3::XorBP, aby3::XorBB,                             // Xor
           aby3::BitrevB,                                        // bitreverse
           aby3::BitIntlB, aby3::BitDeintlB,  // bit(de)interleave
-          aby3::RandA,                       // rand
+          aby3::RandA, aby3::RandB,          // rand
 #ifdef ENABLE_PRECISE_ABY3_TRUNCPR
           aby3::TruncAPr,  // Trunc
 #else
diff --git a/libspu/mpc/api.cc b/libspu/mpc/api.cc
index 3404f00f6..b61a5d3ed 100644
--- a/libspu/mpc/api.cc
+++ b/libspu/mpc/api.cc
@@ -270,10 +270,14 @@ Value rand_p(SPUContext* ctx, const Shape& shape) {
   FORCE_DISPATCH(ctx, shape);
 }
 
-Value rand_s(SPUContext* ctx, const Shape& shape) {
+Value rand_s(SPUContext* ctx, const Shape& shape, DataType dtype) {
   SPU_TRACE_MPC_DISP(ctx, shape);
   TRY_DISPATCH(ctx, shape);
-  // always return random a share
+  // Only random bits (DT_I1) are available as a b share for now.
+  if (dtype == DT_I1) {
+    return rand_b(ctx, shape);
+  }
+  // For any other dtype, return a random a share.
   return rand_a(ctx, shape);
 }
 
diff --git a/libspu/mpc/api.h b/libspu/mpc/api.h
index 50656f9ad..882bf1002 100644
--- a/libspu/mpc/api.h
+++ b/libspu/mpc/api.h
@@ -89,7 +89,7 @@ Value make_p(SPUContext* ctx, uint128_t init, const Shape& shape);
 
 // parties random a public together.
 Value rand_p(SPUContext* ctx, const Shape& shape);
-Value rand_s(SPUContext* ctx, const Shape& shape);
+Value rand_s(SPUContext* ctx, const Shape& shape, DataType dtype);
 
 // Compute bitwise not of a value.
 Value not_p(SPUContext* ctx, const Value& x);
diff --git a/libspu/mpc/cheetah/arith/BUILD.bazel b/libspu/mpc/cheetah/arith/BUILD.bazel
index 809f63cd8..a61726a0b 100644
--- a/libspu/mpc/cheetah/arith/BUILD.bazel
+++ b/libspu/mpc/cheetah/arith/BUILD.bazel
@@ -83,7 +83,7 @@ spu_cc_test(
     srcs = ["matmat_prot_test.cc"],
     deps = [
         ":matmat_prot",
-        "@com_github_xtensor_xtensor//:xtensor",
+        "@xtensor",
     ],
 )
 
@@ -94,7 +94,7 @@ spu_cc_test(
         ":cheetah_mul",
         "//libspu/mpc/utils:ring_ops",
         "//libspu/mpc/utils:simulate",
-        "@com_github_xtensor_xtensor//:xtensor",
+        "@xtensor",
     ],
 )
 
@@ -106,7 +106,7 @@ spu_cc_test(
         ":cheetah_dot",
         "//libspu/mpc/utils:ring_ops",
         "//libspu/mpc/utils:simulate",
-        "@com_github_xtensor_xtensor//:xtensor",
+        "@xtensor",
     ],
 )
 
diff --git a/libspu/mpc/cheetah/boolean.h b/libspu/mpc/cheetah/boolean.h
index c3799708e..26c318116 100644
--- a/libspu/mpc/cheetah/boolean.h
+++ b/libspu/mpc/cheetah/boolean.h
@@ -39,6 +39,17 @@ class CastTypeB : public CastTypeKernel {
                   const Type& to_type) const override;
 };
 
+// Random boolean-share kernel; reports zero latency and zero communication.
+class RandB : public RandKernel {
+ public:
+  static constexpr const char* kBindName() { return "rand_b"; }
+
+  ce::CExpr latency() const override { return ce::Const(0); }
+  ce::CExpr comm() const override { return ce::Const(0); }
+
+  NdArrayRef proc(KernelEvalContext* ctx, const Shape& shape) const override;
+};
+
 class B2P : public UnaryKernel {
  public:
   static constexpr const char* kBindName() { return "b2p"; }
diff --git a/libspu/mpc/cheetah/boolean_semi2k.cc b/libspu/mpc/cheetah/boolean_semi2k.cc
index c432e38ab..74c9c6e8a 100644
--- a/libspu/mpc/cheetah/boolean_semi2k.cc
+++ b/libspu/mpc/cheetah/boolean_semi2k.cc
@@ -61,6 +61,22 @@ NdArrayRef CastTypeB::proc(KernelEvalContext*, const NdArrayRef& in,
   return in.as(to_type);
 }
 
+// Builds a random 1-bit boolean share from the low bit of genPriv() output.
+NdArrayRef RandB::proc(KernelEvalContext* ctx, const Shape& shape) const {
+  auto* prg_state = ctx->getState<PrgState>();
+  const auto field = ctx->getState<Z2kState>()->getDefaultField();
+
+  return DISPATCH_ALL_FIELDS(field, [&]() {
+    auto r = prg_state->genPriv(field, shape);
+    // only rand bit is supported
+    const size_t nbits = 1;
+    NdArrayView<ring2k_t> _r(r);
+    // Clear every bit except the least-significant one, in place.
+    pforeach(0, shape.numel(), [&](int64_t idx) { _r[idx] = _r[idx] & 1; });
+    return makeBShare(r, field, nbits);
+  });
+}
+
 NdArrayRef B2P::proc(KernelEvalContext* ctx, const NdArrayRef& in) const {
   const auto field = in.eltype().as<Ring2k>()->field();
   auto* comm = ctx->getState<Communicator>();
diff --git a/libspu/mpc/cheetah/conversion.cc b/libspu/mpc/cheetah/conversion.cc
index c8ec57b37..07ad6a4b8 100644
--- a/libspu/mpc/cheetah/conversion.cc
+++ b/libspu/mpc/cheetah/conversion.cc
@@ -59,6 +59,10 @@ NdArrayRef A2B::proc(KernelEvalContext* ctx, const NdArrayRef& x) const {
 
 NdArrayRef B2A::proc(KernelEvalContext* ctx, const NdArrayRef& x) const {
   const auto field = ctx->getState<Z2kState>()->getDefaultField();
+  const auto numel = x.numel();
+  if (numel == 0) {  // for empty input
+    return NdArrayRef(makeType<AShrTy>(field), x.shape());
+  }
   return TiledDispatchOTFunc(
              ctx, x,
              [&](const NdArrayRef& input,
diff --git a/libspu/mpc/cheetah/ot/BUILD.bazel b/libspu/mpc/cheetah/ot/BUILD.bazel
index 00a1dfc75..30af42854 100644
--- a/libspu/mpc/cheetah/ot/BUILD.bazel
+++ b/libspu/mpc/cheetah/ot/BUILD.bazel
@@ -29,11 +29,11 @@ spu_cc_library(
         "//libspu/core:ndarray_ref",
         "//libspu/core:prelude",
         "//libspu/mpc/common:communicator",
-        "@com_google_absl//absl/types:span",
+        "@abseil-cpp//absl/types:span",
         "@yacl//yacl/base:int128",
     ] + select({
         "@platforms//cpu:aarch64": [
-            "@com_github_dltcollab_sse2neon//:sse2neon",
+            "@sse2neon",
         ],
         "//conditions:default": [],
     }),
diff --git a/libspu/mpc/cheetah/ot/emp/BUILD.bazel b/libspu/mpc/cheetah/ot/emp/BUILD.bazel
index e4b753d90..05e1b3c7f 100644
--- a/libspu/mpc/cheetah/ot/emp/BUILD.bazel
+++ b/libspu/mpc/cheetah/ot/emp/BUILD.bazel
@@ -31,8 +31,8 @@ spu_cc_library(
         "//libspu/mpc/cheetah/ot:ferret_ot_interface",
         "//libspu/mpc/cheetah/ot:ot_util",
         "//libspu/mpc/common:communicator",
-        "@com_github_emptoolkit_emp_ot//:emp-ot",
-        "@com_github_emptoolkit_emp_tool//:emp-tool",
+        "@emp-ot//:emp-ot",
+        "@emp-tool//:emp-tool",
         "@yacl//yacl/base:int128",
         "@yacl//yacl/link",
     ],
diff --git a/libspu/mpc/cheetah/protocol.cc b/libspu/mpc/cheetah/protocol.cc
index 8b638d57c..407e6ca2d 100644
--- a/libspu/mpc/cheetah/protocol.cc
+++ b/libspu/mpc/cheetah/protocol.cc
@@ -79,7 +79,7 @@ void regCheetahProtocol(SPUContext* ctx,
                   cheetah::CommonTypeB, cheetah::CommonTypeV,               //
                   cheetah::CastTypeB, cheetah::AndBP, cheetah::AndBB,       //
                   cheetah::XorBP, cheetah::XorBB,                           //
-                  cheetah::RandA>();
+                  cheetah::RandA, cheetah::RandB>();
 }
 
 std::unique_ptr<SPUContext> makeCheetahProtocol(
diff --git a/libspu/mpc/cheetah/rlwe/BUILD.bazel b/libspu/mpc/cheetah/rlwe/BUILD.bazel
index 5b08672f0..e97d0b932 100644
--- a/libspu/mpc/cheetah/rlwe/BUILD.bazel
+++ b/libspu/mpc/cheetah/rlwe/BUILD.bazel
@@ -62,7 +62,7 @@ spu_cc_library(
     ],
     deps = [
         "//libspu/mpc/utils:ring_ops",
-        "@com_github_microsoft_seal//:seal",
+        "@seal",
     ],
 )
 
diff --git a/libspu/mpc/common/communicator.cc b/libspu/mpc/common/communicator.cc
index b7dc4089f..315649139 100644
--- a/libspu/mpc/common/communicator.cc
+++ b/libspu/mpc/common/communicator.cc
@@ -147,17 +147,29 @@ std::vector<NdArrayRef> Communicator::gather(const NdArrayRef& in, size_t root,
 }
 
 NdArrayRef Communicator::broadcast(const NdArrayRef& in, size_t root,
+                                   const Type& eltype, const Shape& shape,
                                    std::string_view tag) {
-  const auto array = getOrCreateCompactArray(in);
-  yacl::ByteContainerView bv(reinterpret_cast<uint8_t const*>(array.data()),
-                             array.elsize() * array.numel());
-  auto buf = yacl::link::Broadcast(lctx_, bv, root, tag);
-
   stats_.latency += 1;
   stats_.comm += in.elsize() * in.numel();
 
-  return NdArrayRef(stealBuffer(std::move(buf)), in.eltype(), in.shape(),
-                    makeCompactStrides(in.shape()), kOffset);
+  // Root sends `in`; other ranks rebuild the result from `eltype` and `shape`.
+  if (lctx_->Rank() == root) {
+    const auto array = getOrCreateCompactArray(in);
+    yacl::ByteContainerView bv(reinterpret_cast<uint8_t const*>(array.data()),
+                               array.elsize() * array.numel());
+    auto buf = yacl::link::Broadcast(lctx_, bv, root, tag);
+    return NdArrayRef(stealBuffer(std::move(buf)), in.eltype(), in.shape(),
+                      makeCompactStrides(in.shape()), kOffset);
+  } else {
+    // yacl::link::Broadcast needs a valid ByteContainerView on every rank,
+    // but the payload supplied by non-root ranks is not actually used.
+    std::array<uint8_t, 1> dummy = {0};
+    auto buf = yacl::link::Broadcast(lctx_, dummy, root, tag);
+    SPU_ENFORCE(static_cast<size_t>(buf.size()) ==
+                shape.numel() * eltype.size());
+    return NdArrayRef(stealBuffer(std::move(buf)), eltype, shape,
+                      makeCompactStrides(shape), kOffset);
+  }
 }
 
 void Communicator::sendAsync(size_t dst_rank, const NdArrayRef& in,
diff --git a/libspu/mpc/common/communicator.h b/libspu/mpc/common/communicator.h
index f61039376..44a383717 100644
--- a/libspu/mpc/common/communicator.h
+++ b/libspu/mpc/common/communicator.h
@@ -106,7 +106,8 @@ class Communicator : public State {
   std::vector<NdArrayRef> gather(const NdArrayRef& in, size_t root,
                                  std::string_view tag);
 
-  NdArrayRef broadcast(const NdArrayRef& in, size_t root, std::string_view tag);
+  NdArrayRef broadcast(const NdArrayRef& in, size_t root, const Type& eltype,
+                       const Shape& shape, std::string_view tag);
 
   NdArrayRef reduce(ReduceOp op, const NdArrayRef& in, size_t root,
                     std::string_view tag);
diff --git a/libspu/mpc/kernel.cc b/libspu/mpc/kernel.cc
index 56b5c1bb0..628f949c3 100644
--- a/libspu/mpc/kernel.cc
+++ b/libspu/mpc/kernel.cc
@@ -275,4 +275,16 @@ void OramReadKernel::evaluate(KernelEvalContext* ctx) const {
       WrapValue(proc(ctx, UnwrapValue(onehot), UnwrapValue(db), offset)));
 }
 
+// Unwraps param 0 (a vector of Values), runs proc() and pushes its result.
+void MultiKeyLowMcKernel::evaluate(KernelEvalContext* ctx) const {
+  const auto& in = ctx->getParam<std::vector<Value>>(0);
+  std::vector<NdArrayRef> inputs;
+  inputs.reserve(in.size());
+  for (const auto& item : in) {
+    inputs.push_back(UnwrapValue(item));
+  }
+  auto y = proc(ctx, inputs);
+  ctx->pushOutput(WrapValue(y));
+}
+
 }  // namespace spu::mpc
diff --git a/libspu/mpc/kernel.h b/libspu/mpc/kernel.h
index d75b3426a..90c629dde 100644
--- a/libspu/mpc/kernel.h
+++ b/libspu/mpc/kernel.h
@@ -225,4 +225,12 @@ class DisassembleKernel : public Kernel {
                                        const NdArrayRef& in) const = 0;
 };
 
+// Kernel interface taking a list of input arrays and returning one array.
+class MultiKeyLowMcKernel : public Kernel {
+ public:
+  void evaluate(KernelEvalContext* ctx) const override;
+  virtual NdArrayRef proc(KernelEvalContext* ctx,
+                          const std::vector<NdArrayRef>& inputs) const = 0;
+};
+
 }  // namespace spu::mpc
diff --git a/libspu/mpc/semi2k/BUILD.bazel b/libspu/mpc/semi2k/BUILD.bazel
index dfef1ec7d..661b0f329 100644
--- a/libspu/mpc/semi2k/BUILD.bazel
+++ b/libspu/mpc/semi2k/BUILD.bazel
@@ -112,6 +112,7 @@ spu_cc_library(
         ":boolean",
         ":conversion",
         ":exp",
+        ":lowmc",
         ":permute",
         ":state",
         "//libspu/mpc/common:prg_state",
@@ -130,6 +131,8 @@ spu_cc_test(
         "//libspu/mpc:ab_api_test",
         "//libspu/mpc:api_test",
         "//libspu/mpc/semi2k/beaver/beaver_impl/ttp_server:beaver_server",
+        "//libspu/mpc/utils:lowmc",
+        "@yacl//yacl/utils:elapsed_timer",
     ],
 )
 
@@ -184,3 +187,17 @@ spu_cc_library(
         "//libspu/mpc/utils:ring_ops",
     ],
 )
+
+spu_cc_library(
+    name = "lowmc",
+    srcs = ["lowmc.cc"],
+    hdrs = ["lowmc.h"],
+    deps = [
+        ":type",
+        "//libspu/mpc:ab_api",
+        "//libspu/mpc:kernel",
+        "//libspu/mpc/common:prg_state",
+        "//libspu/mpc/common:pv2k",
+        "//libspu/mpc/utils:lowmc",
+    ],
+)
diff --git a/libspu/mpc/semi2k/beaver/BUILD.bazel b/libspu/mpc/semi2k/beaver/BUILD.bazel
index 8fab06b2b..84eb3112a 100644
--- a/libspu/mpc/semi2k/beaver/BUILD.bazel
+++ b/libspu/mpc/semi2k/beaver/BUILD.bazel
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("//bazel:spu.bzl", "spu_cc_library", "spu_cc_test")
+load("//bazel:spu.bzl", "spu_cc_library")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -25,7 +25,7 @@ spu_cc_library(
         "//libspu/core:ndarray_ref",
         "//libspu/core:shape",
         "//libspu/mpc/common:prg_tensor",
-        "@com_github_google_leveldb//:leveldb",
+        "@leveldb",
     ],
 )
 
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/BUILD.bazel b/libspu/mpc/semi2k/beaver/beaver_impl/BUILD.bazel
index 5f0bd1e12..03f1e8702 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/BUILD.bazel
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/BUILD.bazel
@@ -24,10 +24,10 @@ spu_cc_library(
         "//libspu/mpc/common:prg_tensor",
         "//libspu/mpc/semi2k/beaver:beaver_interface",
         "//libspu/mpc/semi2k/beaver/beaver_impl/trusted_party",
-        "//libspu/mpc/semi2k/beaver/beaver_impl/ttp_server:beaver_stream",
         "//libspu/mpc/utils:gfmp_ops",
+        "//libspu/mpc/utils:permute",
         "//libspu/mpc/utils:ring_ops",
-        "@com_github_microsoft_seal//:seal",
+        "@seal",
         "@yacl//yacl/link",
         "@yacl//yacl/utils:parallel",
     ],
@@ -44,7 +44,7 @@ spu_cc_test(
         "//libspu/mpc/utils:gfmp",
         "//libspu/mpc/utils:permute",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
 )
 
@@ -55,9 +55,9 @@ spu_cc_library(
     deps = [
         "//libspu/mpc/common:prg_tensor",
         "//libspu/mpc/semi2k/beaver:beaver_interface",
-        "//libspu/mpc/semi2k/beaver/beaver_impl/ttp_server:beaver_stream",
         "//libspu/mpc/semi2k/beaver/beaver_impl/ttp_server:service_cc_proto",
         "//libspu/mpc/utils:gfmp_ops",
+        "//libspu/mpc/utils:permute",
         "//libspu/mpc/utils:ring_ops",
         "@yacl//yacl/crypto/pke:sm2_enc",
         "@yacl//yacl/link",
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_test.cc b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_test.cc
index 300a2f6e7..425fd4fe6 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_test.cc
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_test.cc
@@ -1093,22 +1093,20 @@ TEST_P(BeaverTest, PermPair) {
   const size_t kWorldSize = std::get<1>(GetParam());
   const FieldType kField = std::get<2>(GetParam());
   const size_t adjust_rank = std::get<4>(GetParam());
-  const int64_t kNumel = 10;
-  std::random_device rd;
-  uint128_t seed = rd();
-  uint64_t ctr = rd();
-  const auto r_perm = genRandomPerm(kNumel, seed, &ctr);
+  const int64_t kNumel = 666 * 1024 + 1;
 
   for (size_t r = 0; r < kWorldSize; ++r) {
-    std::vector<Pair> pairs(kWorldSize);
+    std::vector<Beaver::Pair> pairs(kWorldSize);
+    Index perm;
     utils::simulate(
         kWorldSize, [&](const std::shared_ptr<yacl::link::Context>& lctx) {
           auto beaver = factory(lctx, ttp_options_, adjust_rank);
           auto rank = lctx->Rank();
+          auto PermPair = beaver->PermPair(kField, kNumel, r);
+          pairs[lctx->Rank()].first = std::move(std::get<0>(PermPair));
+          pairs[lctx->Rank()].second = std::move(std::get<1>(PermPair));
           if (rank == r) {
-            pairs[lctx->Rank()] = beaver->PermPair(kField, kNumel, r, r_perm);
-          } else {
-            pairs[lctx->Rank()] = beaver->PermPair(kField, kNumel, r, {});
+            perm = std::move(std::get<2>(PermPair));
           }
           yacl::link::Barrier(lctx, "BeaverUT");
         });
@@ -1116,7 +1114,7 @@ TEST_P(BeaverTest, PermPair) {
     EXPECT_EQ(pairs.size(), kWorldSize);
     auto open = open_buffer(pairs, kField, std::vector<Shape>(2, {kNumel}),
                             kWorldSize, true);
-    EXPECT_TRUE(ring_all_equal(applyInvPerm(open[0], r_perm), open[1], 0));
+    EXPECT_TRUE(ring_all_equal(applyInvPerm(open[0], perm), open[1], 0));
   }
 }
 
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.cc b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.cc
index f876209d2..a8d19e414 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.cc
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.cc
@@ -18,12 +18,14 @@
 #include <utility>
 
 #include "yacl/crypto/rand/rand.h"
+#include "yacl/link/algorithm/broadcast.h"
 #include "yacl/link/algorithm/gather.h"
 #include "yacl/utils/serialize.h"
 
 #include "libspu/mpc/common/prg_tensor.h"
 #include "libspu/mpc/semi2k/beaver/beaver_impl/trusted_party/trusted_party.h"
 #include "libspu/mpc/utils/gfmp_ops.h"
+#include "libspu/mpc/utils/permute.h"
 #include "libspu/mpc/utils/ring_ops.h"
 
 namespace spu::mpc::semi2k {
@@ -373,40 +375,46 @@ BeaverTfpUnsafe::Array BeaverTfpUnsafe::RandBit(FieldType field, int64_t size) {
   return std::move(*a.buf());
 }
 
-BeaverTfpUnsafe::Pair BeaverTfpUnsafe::PermPair(
-    FieldType field, int64_t size, size_t perm_rank,
-    absl::Span<const int64_t> perm_vec) {
+BeaverTfpUnsafe::PremTriple BeaverTfpUnsafe::PermPair(FieldType field,
+                                                      int64_t size,
+                                                      size_t perm_rank) {
   constexpr char kTag[] = "BEAVER_TFP:PERM";
+  SPU_ENFORCE(perm_rank < lctx_->WorldSize(), "perm_rank out of range");
 
   std::vector<TrustedParty::Operand> ops(2);
   Shape shape({size});
 
   auto a = prgCreateArray(field, shape, seed_, &counter_, &ops[0].desc);
   auto b = prgCreateArray(field, shape, seed_, &counter_, &ops[1].desc);
+  Index pi;
+  // Only perm_rank samples the permutation, using its own seed_.
+  if (lctx_->Rank() == perm_rank) {
+    pi = genRandomPerm(size, seed_, &counter_);
+  }
 
   if (lctx_->Rank() == 0) {
     for (auto& op : ops) {
       op.seeds = seeds_;
     }
-    if (perm_rank != lctx_->Rank()) {
-      auto pv_buf = lctx_->Recv(perm_rank, kTag);
-
-      ring_add_(b, TrustedParty::adjustPerm(
-                       absl::MakeSpan(ops),
-                       absl::MakeSpan(pv_buf.data<int64_t>(),
-                                      pv_buf.size() / sizeof(int64_t))));
+    if (perm_rank != 0) {
+      auto peer_pi = genRandomPerm(size, seeds_[perm_rank], &counter_);
+      ring_add_(b, TrustedParty::adjustPerm(absl::MakeSpan(ops), peer_pi));
     } else {
-      ring_add_(b, TrustedParty::adjustPerm(absl::MakeSpan(ops), perm_vec));
+      ring_add_(b, TrustedParty::adjustPerm(absl::MakeSpan(ops), pi));
     }
-  } else if (perm_rank == lctx_->Rank()) {
-    lctx_->SendAsync(
-        0, yacl::Buffer(perm_vec.data(), perm_vec.size() * sizeof(int64_t)),
-        kTag);
   }
 
-  Pair ret;
-  ret.first = std::move(*a.buf());
-  ret.second = std::move(*b.buf());
+  // All parties adopt perm_rank's PRG counter after the genRandomPerm call.
+  auto new_counter_buf = yacl::link::Broadcast(
+      lctx_, yacl::SerializeVars<PrgCounter>(counter_), perm_rank, kTag);
+  counter_ = yacl::DeserializeVars<PrgCounter>(new_counter_buf);
+
+  PremTriple ret;
+  std::get<0>(ret) = std::move(*a.buf());
+  std::get<1>(ret) = std::move(*b.buf());
+  if (lctx_->Rank() == perm_rank) {
+    std::get<2>(ret) = std::move(pi);
+  }
 
   return ret;
 }
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.h b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.h
index 2f26a7168..bd618dd98 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.h
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_tfp.h
@@ -66,8 +66,7 @@ class BeaverTfpUnsafe final : public Beaver {
 
   Array RandBit(FieldType field, int64_t size) override;
 
-  Pair PermPair(FieldType field, int64_t size, size_t perm_rank,
-                absl::Span<const int64_t> perm_vec) override;
+  PremTriple PermPair(FieldType field, int64_t size, size_t perm_rank) override;
 
   std::unique_ptr<Beaver> Spawn() override;
 
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.cc b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.cc
index be2e9e866..496f272cf 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.cc
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.cc
@@ -14,19 +14,26 @@
 
 #include "libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.h"
 
+#include <condition_variable>
 #include <future>
+#include <mutex>
 #include <utility>
 #include <vector>
 
+#include "brpc/progressive_reader.h"
 #include "yacl/crypto/pke/sm2_enc.h"
 #include "yacl/crypto/rand/rand.h"
 #include "yacl/link/algorithm/allgather.h"
+#include "yacl/link/algorithm/broadcast.h"
+#include "yacl/utils/serialize.h"
 
 #include "libspu/mpc/common/prg_tensor.h"
-#include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_stream.h"
 #include "libspu/mpc/utils/gfmp_ops.h"
+#include "libspu/mpc/utils/permute.h"
 #include "libspu/mpc/utils/ring_ops.h"
 
+#include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.pb.h"
+
 namespace brpc {
 
 DECLARE_uint64(max_body_size);
@@ -63,7 +70,7 @@ AdjustRequest BuildAdjustRequest(
 
   SPU_ENFORCE(!descs.empty());
 
-  uint32_t field_size;
+  uint32_t field_size = 0;
   ElementType eltype = ElementType::kRing;
 
   for (size_t i = 0; i < descs.size(); i++) {
@@ -104,136 +111,205 @@ AdjustRequest BuildAdjustRequest(
 template <class T>
 struct dependent_false : std::false_type {};
 
-class StreamReader : public brpc::StreamInputHandler {
+// Returns {num_buf, buf_len}: reply buffer count and the byte size of each.
+template <class AdjustRequest>
+std::tuple<int32_t, int64_t> GetBufferLength(const AdjustRequest& req) {
+  if constexpr (std::is_same_v<AdjustRequest,
+                               beaver::ttp_server::AdjustDotRequest>) {
+    SPU_ENFORCE_EQ(req.prg_inputs().size(), 3);
+    return {1, req.prg_inputs()[2].buffer_len()};
+  } else if constexpr (std::is_same_v<
+                           AdjustRequest,
+                           beaver::ttp_server::AdjustTruncPrRequest>) {
+    SPU_ENFORCE_GE(req.prg_inputs().size(), 1);
+    return {2, req.prg_inputs()[0].buffer_len()};
+  } else {
+    SPU_ENFORCE_GE(req.prg_inputs().size(), 1);
+    return {1, req.prg_inputs()[0].buffer_len()};
+  }
+}
+
+class ProgressiveReader : public brpc::ProgressiveReader {
  public:
-  enum class Status : int8_t {
-    kNotFinished,
-    kNormalFinished,
-    kAbnormalFinished,
-    kStreamFailed,
-  };
+  ProgressiveReader(int32_t num_buf, int64_t buf_len)
+      : buffer_remain_size_(buf_len * num_buf),
 
-  StreamReader(int32_t num_buf, size_t buf_len) {
-    SPU_ENFORCE(num_buf > 0);
-    SPU_ENFORCE(buf_len > 0);
-    buf_vec_.resize(num_buf);
-    buf_len_ = buf_len;
-    future_finished_ = promise_finished_.get_future();
-    future_closed_ = promise_closed_.get_future();
+        receive_buffers_(num_buf) {
+    for (auto& b : receive_buffers_) {
+      b.resize(buf_len);
+    }
   }
 
-  int on_received_messages(brpc::StreamId id, butil::IOBuf* const messages[],
-                           size_t size) override {
-    SPDLOG_DEBUG("on_received_messages, stream id: {}", id);
-    for (size_t i = 0; i < size; ++i) {
-      if (status_ != Status::kNotFinished) {
-        SPDLOG_ERROR("unexpected messages received");
-        return -1;
+  butil::Status OnReadOnePart(const void* data, size_t length) final {
+    size_t consumed = 0;
+    try {
+      while (consumed < length) {
+        const auto* consume_data =
+            reinterpret_cast<const uint8_t*>(data) + consumed;
+        size_t remain_length = length - consumed;
+
+        if (current_state_ == ReadFlags) {
+          consumed += copy_to_flags(consume_data, remain_length);
+        } else if (current_state_ == ReadChunk) {
+          consumed += copy_to_buffer(consume_data, remain_length);
+        } else if (current_state_ == ReadError) {
+          consumed += copy_to_error(consume_data, remain_length);
+        } else if (current_state_ == End) {
+          return butil::Status(
+              -1, "response size mismatch, receive data after end");
+        }
       }
-
-      SPDLOG_DEBUG("receive buf size: {}", messages[i]->size());
-      const auto& message = messages[i];
-      beaver::ttp_server::BeaverDownStreamMeta meta;
-      message->copy_to(&meta, sizeof(meta));
-      message->pop_front(sizeof(meta));
-      if (meta.err_code != 0) {
-        SPDLOG_ERROR("response error from server, err_code: {}, err_text: {}",
-                     meta.err_code, message->to_string());
-        status_ = Status::kAbnormalFinished;
-        promise_finished_.set_value(status_);
-        return -2;
+      if (current_state_ == End && !server_error_msg_.empty()) {
+        return butil::Status(
+            -1,
+            fmt::format("server side error code {}, msg {}",
+                        beaver::ttp_server::ErrorCode_Name(server_error_code_),
+                        server_error_msg_));
       }
+    } catch (const std::exception& e) {
+      return butil::Status(-1, fmt::format("unexpected error {}", e.what()));
+    }
 
-      SPU_ENFORCE(message->length() % buf_vec_.size() == 0);
-      size_t msg_len = message->length() / buf_vec_.size();
-      for (size_t buf_idx = 0; buf_idx < buf_vec_.size(); ++buf_idx) {
-        message->append_to(&buf_vec_[buf_idx], msg_len, buf_idx * msg_len);
-      }
+    return butil::Status::OK();
+  }
 
-      SPU_ENFORCE(buf_vec_[0].length() <= buf_len_,
-                  "unexpected bytes received");
-      if (buf_vec_[0].length() == buf_len_) {
-        status_ = Status::kNormalFinished;
-        promise_finished_.set_value(status_);
+  void OnEndOfMessage(const butil::Status& status) final {
+    {
+      std::lock_guard lk(lock_);
+      if (current_state_ == End) {
+        // All expected data (or a full server error) was parsed.
+        read_status_ = status;
+      } else if (status.ok()) {
+        // The RPC stream ended cleanly, but the parser expected more data.
+        read_status_ =
+            butil::Status(-1, "response size mismatch, need more data");
+      } else {
+        // Some error happened in the network or in OnReadOnePart.
+        read_status_ = status;
       }
     }
-    return 0;
+    cond_.notify_all();
   }
 
-  void on_idle_timeout(brpc::StreamId id) override {
-    SPDLOG_WARN("Stream {} idle timeout", id);
+  void Wait() {
+    {
+      std::unique_lock lk(lock_);
+      cond_.wait(lk, [this] { return read_status_.has_value(); });
+    }
+    SPU_ENFORCE(read_status_->ok(), "Beaver Streaming data err: {}",
+                read_status_->error_str());
   }
 
-  void on_closed(brpc::StreamId id) override {
-    SPDLOG_DEBUG("Stream {} closed", id);
-    promise_closed_.set_value();
+  std::vector<yacl::Buffer> PopBuffer() {
+    {
+      std::lock_guard lk(lock_);
+      SPU_ENFORCE(current_state_ == End, "pls wait streaming finished");
+    }
+    return std::move(receive_buffers_);
   }
 
-  void on_failed(brpc::StreamId id, int error_code,
-                 const std::string& error_text) override {
-    SPDLOG_ERROR("Stream {} failed, error_code: {}, error_text: {}", id,
-                 error_code, error_text);
-    status_ = Status::kStreamFailed;
-    promise_finished_.set_value(status_);
+ private:
+  size_t copy_to_flags(const void* data, size_t length) {
+    size_t cp_size = std::min(flags_.size() - flags_pos_, length);
+    std::memcpy(flags_.data() + flags_pos_, data, cp_size);
+    flags_pos_ += cp_size;
+    if (flags_pos_ == flags_.size()) {
+      flags_pos_ = 0;
+      int64_t chunk_size = 0;
+      std::memcpy(&chunk_size, &flags_[1], sizeof(int64_t));
+      chunk_remain_size_ = chunk_size;
+      if (flags_[0] == 0) {
+        current_state_ = ReadChunk;
+      } else if (beaver::ttp_server::ErrorCode_IsValid(flags_[0])) {
+        server_error_code_ =
+            static_cast<beaver::ttp_server::ErrorCode>(flags_[0]);
+        current_state_ = ReadError;
+      } else {
+        SPU_THROW("unexpected flags[0] {}", flags_[0]);
+      }
+    }
+
+    return cp_size;
   }
 
-  const auto& GetBufVecRef() const {
-    SPU_ENFORCE(status_ == Status::kNormalFinished);
-    return buf_vec_;
+  size_t copy_to_buffer(const void* data, size_t length) {
+    length = std::min(length, chunk_remain_size_);
+    chunk_remain_size_ -= length;
+    if (chunk_remain_size_ == 0) {
+      current_state_ = ReadFlags;
+    }
+
+    if (length > buffer_remain_size_) {
+      SPU_THROW("response size mismatch, too many data for buffer");
+    }
+
+    buffer_remain_size_ -= length;
+    if (buffer_remain_size_ == 0) {
+      current_state_ = End;
+    }
+    // A payload may span several buffers: bound each copy by unread bytes.
+    size_t data_pos = 0;
+    while (data_pos < length) {
+      if (current_buffer_idx_ >= receive_buffers_.size()) {
+        SPU_THROW("response size mismatch, outof index");
+      }
+      auto& buffer = receive_buffers_[current_buffer_idx_];
+      auto cp_size = std::min(length - data_pos,
+                              buffer.size() - current_buffer_pos_);
+      std::memcpy(buffer.data<uint8_t>() + current_buffer_pos_,
+                  reinterpret_cast<const uint8_t*>(data) + data_pos, cp_size);
+      current_buffer_pos_ += cp_size;
+      if (current_buffer_pos_ == static_cast<size_t>(buffer.size())) {
+        current_buffer_pos_ = 0;
+        current_buffer_idx_ += 1;
+      }
+      data_pos += cp_size;
+    }
+    return length;
   }
 
-  Status WaitFinished() { return future_finished_.get(); };
+  size_t copy_to_error(const void* data, size_t length) {
+    length = std::min(length, chunk_remain_size_);
+    chunk_remain_size_ -= length;
+    if (chunk_remain_size_ == 0) {
+      current_state_ = End;
+    }
 
-  void WaitClosed() { future_closed_.wait(); }
+    server_error_msg_.append(reinterpret_cast<const char*>(data), length);
+    return length;
+  }
 
  private:
-  std::vector<butil::IOBuf> buf_vec_;
-  size_t buf_len_;
-  Status status_ = Status::kNotFinished;
-  std::promise<Status> promise_finished_;
-  std::promise<void> promise_closed_;
-  std::future<Status> future_finished_;
-  std::future<void> future_closed_;
+  enum State : uint8_t {
+    ReadFlags = 0,
+    ReadChunk = 1,
+    ReadError = 2,
+    End = 3,
+  };
+  State current_state_{ReadFlags};
+  size_t flags_pos_{};
+  std::array<uint8_t, 1 + sizeof(int64_t)> flags_;
+  size_t chunk_remain_size_{};
+  std::string server_error_msg_;
+  beaver::ttp_server::ErrorCode server_error_code_;
+
+  size_t buffer_remain_size_;
+  size_t current_buffer_idx_{};
+  size_t current_buffer_pos_{};
+  std::vector<yacl::Buffer> receive_buffers_;
+
+  std::mutex lock_;
+  std::condition_variable cond_;
+  std::optional<butil::Status> read_status_;
 };
 
-// Obtain a tuple containing num_buf and buf_len
-template <class AdjustRequest>
-std::tuple<int32_t, int64_t> GetBufferLength(const AdjustRequest& req) {
-  if constexpr (std::is_same_v<AdjustRequest,
-                               beaver::ttp_server::AdjustDotRequest>) {
-    SPU_ENFORCE_EQ(req.prg_inputs().size(), 3);
-    return {1, req.prg_inputs()[2].buffer_len()};
-  } else if constexpr (std::is_same_v<
-                           AdjustRequest,
-                           beaver::ttp_server::AdjustTruncPrRequest>) {
-    SPU_ENFORCE_GE(req.prg_inputs().size(), 1);
-    return {2, req.prg_inputs()[0].buffer_len()};
-  } else {
-    SPU_ENFORCE_GE(req.prg_inputs().size(), 1);
-    return {1, req.prg_inputs()[0].buffer_len()};
-  }
-}
-
 template <class AdjustRequest>
-std::vector<NdArrayRef> RpcCall(
-    brpc::Channel& channel, AdjustRequest req, FieldType ret_field,
-    const std::vector<butil::IOBuf>* upstream_messages = nullptr) {
-  brpc::Controller cntl;
+std::vector<NdArrayRef> RpcCall(brpc::Channel& channel,
+                                const AdjustRequest& req, FieldType ret_field) {
   beaver::ttp_server::BeaverService::Stub stub(&channel);
   beaver::ttp_server::AdjustResponse rsp;
-
-  auto [num_buf, buf_len] = GetBufferLength(req);
-  StreamReader reader(num_buf, buf_len);
-  brpc::StreamOptions stream_options;
-  stream_options.max_buf_size = 2 * beaver::ttp_server::kUpStreamChunkSize;
-  stream_options.handler = &reader;
-  brpc::StreamId stream_id;
-  SPU_ENFORCE_EQ(brpc::StreamCreate(&stream_id, cntl, &stream_options), 0,
-                 "Failed to create stream");
-  auto cleanup = absl::MakeCleanup([&stream_id, &reader]() {
-    SPU_ENFORCE(brpc::StreamClose(stream_id) == 0);
-    reader.WaitClosed();
-  });
+  brpc::Controller cntl;
+  cntl.response_will_be_read_progressively();
 
   if constexpr (std::is_same_v<AdjustRequest,
                                beaver::ttp_server::AdjustMulRequest>) {
@@ -276,35 +352,21 @@ std::vector<NdArrayRef> RpcCall(
 
   SPU_ENFORCE(!cntl.Failed(), "Adjust RpcCall failed, code={} error={}",
               cntl.ErrorCode(), cntl.ErrorText());
-  SPU_ENFORCE(rsp.code() == beaver::ttp_server::ErrorCode::OK,
-              "Adjust server failed code={}, error={}",
-              ErrorCode_Name(rsp.code()), rsp.message());
-
-  if (upstream_messages != nullptr) {
-    for (const auto& message : *upstream_messages) {
-      int ret = brpc::StreamWrite(stream_id, message);
-      if (ret == EAGAIN) {
-        SPU_ENFORCE_EQ(brpc::StreamWait(stream_id, nullptr), 0);
-        ret = brpc::StreamWrite(stream_id, message);
-      }
-      SPU_ENFORCE_EQ(ret, 0, "Write stream failed");
-      SPDLOG_DEBUG("write buf size {} to stream id {}", message.length(),
-                   stream_id);
-    }
-  }
 
-  auto status = reader.WaitFinished();
-  SPU_ENFORCE(status == StreamReader::Status::kNormalFinished,
-              "Stream reader finished abnormally, status: {}",
-              static_cast<int32_t>(status));
+  auto [num_buf, buf_len] = GetBufferLength(req);
+  ProgressiveReader reader(num_buf, buf_len);
+  cntl.ReadProgressiveAttachmentBy(&reader);
+  reader.Wait();
+  auto buffers = reader.PopBuffer();
+
   std::vector<NdArrayRef> ret;
-  for (const auto& buf : reader.GetBufVecRef()) {
-    SPU_ENFORCE(buf.length() % SizeOf(ret_field) == 0);
-    int64_t size = buf.length() / SizeOf(ret_field);
+  for (auto& buf : buffers) {
+    SPU_ENFORCE(buf.size() % SizeOf(ret_field) == 0);
+    int64_t size = buf.size() / SizeOf(ret_field);
     // FIXME: change beaver interface: change return type to buffer.
-    NdArrayRef array(makeType<RingTy>(ret_field), {size});
     // FIXME: TTP adjuster server and client MUST have same endianness.
-    buf.copy_to(array.data());
+    NdArrayRef array(std::make_shared<yacl::Buffer>(std::move(buf)),
+                     makeType<RingTy>(ret_field), {size});
     ret.push_back(std::move(array));
   }
 
@@ -325,15 +387,18 @@ BeaverTtp::BeaverTtp(std::shared_ptr<yacl::link::Context> lctx, Options ops)
   SPU_ENFORCE(lctx_);
   {
     brpc::ChannelOptions brc_options;
+    SPU_ENFORCE(options_.brpc_channel_protocol == "http" ||
+                    options_.brpc_channel_protocol == "h2",
+                "beaver only support http 1.1 or http 2");
     brc_options.protocol = options_.brpc_channel_protocol;
-    brc_options.connection_type = options_.brpc_channel_connection_type;
     brc_options.timeout_ms = options_.brpc_timeout_ms;
     brc_options.max_retry = options_.brpc_max_retry;
-    // TODO TLS
 
-    if (channel_.Init(options_.server_host.c_str(),
-                      options_.brpc_load_balancer_name.c_str(),
-                      &brc_options) != 0) {
+    if (options_.brpc_ssl_options) {
+      *brc_options.mutable_ssl_options() = options_.brpc_ssl_options.value();
+    }
+
+    if (channel_.Init(options_.server_host.c_str(), &brc_options) != 0) {
       SPU_THROW("Fail to initialize channel for BeaverTtp, server_host {}",
                 options_.server_host);
     }
@@ -654,9 +719,9 @@ BeaverTtp::Array BeaverTtp::RandBit(FieldType field, int64_t size) {
   return std::move(*a.buf());
 }
 
-BeaverTtp::Pair BeaverTtp::PermPair(FieldType field, int64_t size,
-                                    size_t perm_rank,
-                                    absl::Span<const int64_t> perm_vec) {
+BeaverTtp::PremTriple BeaverTtp::PermPair(FieldType field, int64_t size,
+                                          size_t perm_rank) {
+  constexpr char kTag[] = "BEAVER_TFP:PERM";
   std::vector<PrgArrayDesc> descs(2);
   std::vector<absl::Span<const PrgSeedBuff>> descs_seed(1, encrypted_seeds_);
   Shape shape({size, 1});
@@ -664,30 +729,34 @@ BeaverTtp::Pair BeaverTtp::PermPair(FieldType field, int64_t size,
   auto a = prgCreateArray(field, shape, seed_, &counter_, descs.data());
   auto b = prgCreateArray(field, shape, seed_, &counter_, &descs[1]);
 
-  if (lctx_->Rank() == perm_rank) {
+  SPU_ENFORCE(perm_rank < lctx_->WorldSize(), "perm_rank out of range");
+  if (lctx_->Rank() == options_.adjust_rank) {
     auto req = BuildAdjustRequest<beaver::ttp_server::AdjustPermRequest>(
         descs, descs_seed);
-    std::vector<butil::IOBuf> stream_data;
-    size_t left_buf_size = perm_vec.size() * sizeof(int64_t);
-    size_t chunk_idx = 0;
-    while (left_buf_size > 0) {
-      using beaver::ttp_server::kUpStreamChunkSize;
-      size_t cur_chunk_size = std::min(left_buf_size, kUpStreamChunkSize);
-      stream_data.emplace_back();
-      stream_data.back().append(reinterpret_cast<const char*>(perm_vec.data()) +
-                                    (chunk_idx * kUpStreamChunkSize),
-                                cur_chunk_size);
-      ++chunk_idx;
-      left_buf_size -= cur_chunk_size;
-    }
-    auto adjusts = RpcCall(channel_, req, field, &stream_data);
+    auto* perm_meta = req.mutable_perm();
+    perm_meta->set_prg_count(counter_);
+    perm_meta->set_size(size);
+    auto& perm_seed = encrypted_seeds_[perm_rank];
+    perm_meta->set_encrypted_seeds(perm_seed.data(), perm_seed.size());
+    auto adjusts = RpcCall(channel_, req, field);
     SPU_ENFORCE_EQ(adjusts.size(), 1U);
     ring_add_(b, adjusts[0].reshape(b.shape()));
   }
 
-  Pair ret;
-  ret.first = std::move(*a.buf());
-  ret.second = std::move(*b.buf());
+  Index pi;
+  if (lctx_->Rank() == perm_rank) {
+    pi = genRandomPerm(size, seed_, &counter_);
+  }
+  // All parties adopt perm_rank's PRG counter after the genRandomPerm call.
+  auto new_counter_buf = yacl::link::Broadcast(
+      lctx_, yacl::SerializeVars<PrgCounter>(counter_), perm_rank, kTag);
+  counter_ = yacl::DeserializeVars<PrgCounter>(new_counter_buf);
+
+  PremTriple ret;
+  std::get<0>(ret) = std::move(*a.buf());
+  std::get<1>(ret) = std::move(*b.buf());
+  std::get<2>(ret) = std::move(pi);
+
   return ret;
 }
 
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.h b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.h
index 501d5eac9..adca54760 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.h
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/beaver_ttp.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+#include <optional>
 
 #include "brpc/channel.h"
 #include "yacl/base/buffer.h"
@@ -34,17 +35,15 @@ class BeaverTtp final : public Beaver {
     // asym_crypto_schema: support ["SM2"]
     // Will support 25519 in the future, after yacl supported it.
     std::string asym_crypto_schema;
-    // TODO: Remote Attestation
     yacl::Buffer server_public_key;
     size_t adjust_rank;
 
-    std::string brpc_channel_protocol = "baidu_std";
-    std::string brpc_channel_connection_type = "single";
-    std::string brpc_load_balancer_name;
-    int32_t brpc_timeout_ms = 10 * 1000;
+    std::string brpc_channel_protocol = "http";
+    int32_t brpc_connect_timeout_ms = 10 * 1000;
+    int32_t brpc_timeout_ms = 300 * 1000;
     int32_t brpc_max_retry = 5;
 
-    // TODO: TLS ops for client/server two-way authentication
+    std::optional<brpc::ChannelSSLOptions> brpc_ssl_options;
   };
 
  private:
@@ -87,8 +86,7 @@ class BeaverTtp final : public Beaver {
 
   Array RandBit(FieldType field, int64_t size) override;
 
-  Pair PermPair(FieldType field, int64_t size, size_t perm_rank,
-                absl::Span<const int64_t> perm_vec) override;
+  PremTriple PermPair(FieldType field, int64_t size, size_t perm_rank) override;
 
   std::unique_ptr<Beaver> Spawn() override;
 
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/BUILD.bazel b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/BUILD.bazel
index 8ccd5d276..98cb4fef4 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/BUILD.bazel
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/BUILD.bazel
@@ -28,9 +28,14 @@ proto_library(
     srcs = ["service.proto"],
 )
 
-spu_cc_library(
-    name = "beaver_stream",
-    hdrs = ["beaver_stream.h"],
+cc_proto_library(
+    name = "config_cc_proto",
+    deps = [":config_proto"],
+)
+
+proto_library(
+    name = "config_proto",
+    srcs = ["config.proto"],
 )
 
 spu_cc_library(
@@ -38,10 +43,9 @@ spu_cc_library(
     srcs = ["beaver_server.cc"],
     hdrs = ["beaver_server.h"],
     deps = [
-        ":beaver_stream",
         ":service_cc_proto",
         "//libspu/mpc/semi2k/beaver/beaver_impl/trusted_party",
-        "@com_github_brpc_brpc//:brpc",
+        "@brpc",
         "@yacl//yacl/crypto/pke:sm2_enc",
     ],
 )
@@ -51,6 +55,7 @@ spu_cc_binary(
     srcs = ["beaver_server_main.cc"],
     deps = [
         ":beaver_server",
+        ":config_cc_proto",
         "//libspu/core:logging",
     ],
 )
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.cc b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.cc
index 2a5136ec2..5676eb2b1 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.cc
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.cc
@@ -14,7 +14,10 @@
 
 #include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h"
 
+#include <brpc/progressive_attachment.h>
+
 #include <algorithm>
+#include <cerrno>
 #include <future>
 #include <vector>
 
@@ -27,7 +30,7 @@
 #include "libspu/core/ndarray_ref.h"
 #include "libspu/mpc/common/prg_tensor.h"
 #include "libspu/mpc/semi2k/beaver/beaver_impl/trusted_party/trusted_party.h"
-#include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_stream.h"
+#include "libspu/mpc/utils/permute.h"
 
 #include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.pb.h"
 
@@ -42,14 +45,22 @@ namespace spu::mpc::semi2k::beaver::ttp_server {
 
 namespace {
 
+const int64_t kReplayChunkSize = 32L * 1024 * 1024;
+
 inline size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }
 
 class DecryptError : public yacl::Exception {
   using yacl::Exception::Exception;
 };
 
+struct PermMeta {
+  uint64_t prg_count;
+  PrgSeed seed;
+  int64_t size;
+};
+
 template <class AdjustRequest>
-std::tuple<std::vector<TrustedParty::Operand>,
+std::tuple<std::vector<TrustedParty::Operand>, PermMeta,
            std::vector<std::vector<PrgSeed>>, size_t>
 BuildOperand(const AdjustRequest& req, uint32_t field_size,
              const std::unique_ptr<yacl::crypto::PkeDecryptor>& decryptor,
@@ -150,107 +161,20 @@ BuildOperand(const AdjustRequest& req, uint32_t field_size,
     }
   }
 
-  return {std::move(ops), std::move(seeds), pad_length};
-}
-
-std::vector<yacl::Buffer> StripNdArray(std::vector<NdArrayRef>& nds,
-                                       size_t pad_length) {
-  std::vector<yacl::Buffer> ret;
-  ret.reserve(nds.size());
-
-  auto if_pad = [&](NdArrayRef& nd) {
-    yacl::Buffer buf = std::move(*nd.buf());
-    if (pad_length > 0) {
-      buf.resize(buf.size() - pad_length);
-    }
-    return buf;
-  };
-
-  for (auto& nd : nds) {
-    ret.push_back(if_pad(nd));
+  PermMeta perm;
+  if constexpr (std::is_same_v<AdjustRequest, AdjustPermRequest>) {
+    const PrgRandPermMeta& perm_meta = req.perm();
+    perm.prg_count = perm_meta.prg_count();
+    perm.size = perm_meta.size();
+    perm.seed = try_decrypt(perm_meta.encrypted_seeds());
   }
 
-  return ret;
+  return {std::move(ops), std::move(perm), std::move(seeds), pad_length};
 }
 
 template <class T>
 struct dependent_false : std::false_type {};
 
-class StreamReader : public brpc::StreamInputHandler {
- public:
-  enum class Status : int8_t {
-    kNotFinished,
-    kNormalFinished,
-    kAbnormalFinished,
-    kStreamFailed,
-  };
-
-  explicit StreamReader(size_t total_buf_len) {
-    total_buf_len_ = total_buf_len;
-    future_finished_ = promise_finished_.get_future();
-    future_closed_ = promise_closed_.get_future();
-  }
-
-  int on_received_messages(brpc::StreamId id, butil::IOBuf* const messages[],
-                           size_t size) override {
-    SPDLOG_DEBUG("on_received_messages, stream id: {}", id);
-    for (size_t i = 0; i < size; ++i) {
-      if (status_ != Status::kNotFinished) {
-        SPDLOG_WARN("unexpected messages received");
-        return -1;
-      }
-      const auto& message = messages[i];
-      SPDLOG_DEBUG("receive buf size: {}", message->size());
-      buf_.append(message->movable());
-      if (buf_.length() == total_buf_len_) {
-        status_ = Status::kNormalFinished;
-        promise_finished_.set_value(status_);
-      } else if (buf_.length() > total_buf_len_) {
-        SPDLOG_ERROR("buf length ({}) greater than expected buf size ({})",
-                     buf_.length(), total_buf_len_);
-        status_ = Status::kAbnormalFinished;
-        promise_finished_.set_value(status_);
-      }
-    }
-    return 0;
-  }
-
-  void on_idle_timeout(brpc::StreamId id) override {
-    SPDLOG_INFO("Stream {} idle timeout", id);
-  }
-
-  void on_closed(brpc::StreamId id) override {
-    SPDLOG_DEBUG("Stream {} closed", id);
-    promise_closed_.set_value();
-  }
-
-  void on_failed(brpc::StreamId id, int error_code,
-                 const std::string& error_text) override {
-    SPDLOG_ERROR("Stream {} failed, error_code: {}, error_text: {}", id,
-                 error_code, error_text);
-    status_ = Status::kStreamFailed;
-    promise_finished_.set_value(status_);
-  }
-
-  const auto& GetBufRef() const {
-    SPU_ENFORCE(status_ == Status::kNormalFinished);
-    return buf_;
-  }
-
-  Status WaitFinished() { return future_finished_.get(); };
-
-  void WaitClosed() { future_closed_.wait(); }
-
- private:
-  butil::IOBuf buf_;
-  size_t total_buf_len_;
-  Status status_ = Status::kNotFinished;
-  std::promise<Status> promise_finished_;
-  std::promise<void> promise_closed_;
-  std::future<Status> future_finished_;
-  std::future<void> future_closed_;
-};
-
 template <class AdjustRequest>
 size_t GetBufferLength(const AdjustRequest& req) {
   if constexpr (std::is_same_v<AdjustRequest,
@@ -266,42 +190,71 @@ size_t GetBufferLength(const AdjustRequest& req) {
   return 0;
 }
 
-void SendStreamData(brpc::StreamId stream_id,
-                    absl::Span<const yacl::Buffer> buf_vec) {
-  SPU_ENFORCE(!buf_vec.empty());
-  for (size_t idx = 1; idx < buf_vec.size(); ++idx) {
-    SPU_ENFORCE_EQ(buf_vec[0].size(), buf_vec[idx].size());
-  }
+void HandleStreamingError(
+    butil::intrusive_ptr<brpc::ProgressiveAttachment>& pa) {
+  int errsv = errno;  // read errno first, before any later call can change it
+  YACL_THROW_IO_ERROR("streaming Write error, errno {}, strerror {}, client {}",
+                      errsv, strerror(errsv),
+                      butil::endpoint2str(pa->remote_side()).c_str());
+}
+
+void SendStreamData(const std::vector<NdArrayRef>& adjusts,
+                    butil::intrusive_ptr<brpc::ProgressiveAttachment>& pa,
+                    int64_t pad_length = 0) {
+  SPU_ENFORCE(!adjusts.empty());
 
-  size_t chunk_size = kDownStreamChunkSize / buf_vec.size();
   // FIXME: TTP adjuster server and client MUST have same endianness.
-  size_t left_buf_size = buf_vec[0].size();
-  int64_t chunk_idx = 0;
-  while (left_buf_size > 0) {
-    butil::IOBuf io_buf;
-    BeaverDownStreamMeta meta;
-    io_buf.append(&meta, sizeof(meta));
-
-    size_t cur_chunk_size = std::min(left_buf_size, chunk_size);
-    for (const auto& buf : buf_vec) {
-      int ret = io_buf.append(buf.data<char>() + (chunk_idx * chunk_size),
-                              cur_chunk_size);
-      SPU_ENFORCE_EQ(ret, 0, "Append data to IO buffer failed");
+  for (const auto& adjust : adjusts) {
+    const auto& buf = adjust.buf();
+    const auto* data = buf->data<uint8_t>();
+    const int64_t need_seed = buf->size() - pad_length;  // bytes to send; the last pad_length bytes are skipped
+    // Each chunk is written as: 1 flag byte, an int64 payload size, then the payload.
+    int64_t pos = 0;
+    while (pos < need_seed) {
+      const int64_t send_size = std::min(need_seed - pos, kReplayChunkSize);
+      std::array<uint8_t, 1 + sizeof(int64_t)> flags;
+      flags[0] = 0;  // 0 is ErrorCode::OK in service.proto; SendError stores its code in this byte
+      std::memcpy(&flags[1], &send_size, sizeof(int64_t));  // native byte order, see FIXME
+      if (pa->Write(flags.data(), flags.size()) != 0) {
+        HandleStreamingError(pa);
+      }
+      if (pa->Write(data + pos, send_size) != 0) {
+        HandleStreamingError(pa);
+      }
+      pos += send_size;
     }
+  }
+}
 
-    // StreamWrite result cannot be EAGAIN, given that we have not set
-    // max_buf_size
-    SPU_ENFORCE_EQ(brpc::StreamWrite(stream_id, io_buf), 0);
+void SendError(butil::intrusive_ptr<brpc::ProgressiveAttachment>& pa,
+               ErrorCode code, const std::string& err) {
+  std::array<uint8_t, 1 + sizeof(int64_t)> flags;  // 1 code byte + int64 message size
+  int64_t err_size = err.size();
+  flags[0] = code;  // same header slot that SendStreamData fills with 0 for data chunks
+  // FIXME: TTP adjuster server and client MUST have same endianness.
+  std::memcpy(&flags[1], &err_size, sizeof(int64_t));
 
-    left_buf_size -= cur_chunk_size;
-    ++chunk_idx;
+  try {
+    if (pa->Write(flags.data(), flags.size()) != 0) {
+      HandleStreamingError(pa);
+    }
+    if (pa->Write(err.data(), err.size()) != 0) {  // message bytes follow the header
+      HandleStreamingError(pa);
+    }
+  } catch (const std::exception& e) {
+    // The error report itself could not be written; all that is left is to log it.
+    SPDLOG_ERROR(
+        "error happend during send error to client, error try to send {}, "
+        "error happend {}",
+        err, e.what());
+    return;
   }
 }
 
 template <class AdjustRequest>
 std::vector<NdArrayRef> AdjustImpl(const AdjustRequest& req,
                                    absl::Span<TrustedParty::Operand> ops,
-                                   StreamReader& stream_reader) {
+                                   const PermMeta& perm) {
   std::vector<NdArrayRef> ret;
   if constexpr (std::is_same_v<AdjustRequest, AdjustMulRequest>) {
     auto adjust = TrustedParty::adjustMul(ops);
@@ -332,14 +285,8 @@ std::vector<NdArrayRef> AdjustImpl(const AdjustRequest& req,
     auto adjust = TrustedParty::adjustEqz(ops);
     ret.push_back(std::move(adjust));
   } else if constexpr (std::is_same_v<AdjustRequest, AdjustPermRequest>) {
-    auto status = stream_reader.WaitFinished();
-    SPU_ENFORCE(status == StreamReader::Status::kNormalFinished,
-                "Stream reader finished abnormally, status: {}",
-                static_cast<int32_t>(status));
-    const auto& buf = stream_reader.GetBufRef();
-    SPU_ENFORCE(buf.length() % sizeof(int64_t) == 0);
-    std::vector<int64_t> pv(buf.length() / sizeof(int64_t));
-    buf.copy_to(pv.data());
+    uint64_t prg_count = perm.prg_count;
+    auto pv = genRandomPerm(perm.size, perm.seed, &prg_count);
     auto adjust = TrustedParty::adjustPerm(ops, pv);
     ret.push_back(std::move(adjust));
   } else {
@@ -352,57 +299,91 @@ std::vector<NdArrayRef> AdjustImpl(const AdjustRequest& req,
 
 template <class AdjustRequest>
 void AdjustAndSend(
-    const AdjustRequest& req, brpc::StreamId stream_id,
-    StreamReader& stream_reader,
+    brpc::Controller* cntl, const AdjustRequest* req,
+    ::google::protobuf::Closure* done,
     const std::unique_ptr<yacl::crypto::PkeDecryptor>& decryptor) {
-  size_t field_size;
-  if constexpr (std::is_same_v<AdjustRequest, AdjustAndRequest>) {
-    field_size = 128 / 8;
-  } else {
-    field_size = req.field_size();
-  }
-  ElementType eltype = ElementType::kRing;
-  // enable eltype for selected requests here
-  // later all requests may support gfmp
-  if constexpr (std::is_same_v<AdjustRequest, AdjustMulRequest> ||
-                std::is_same_v<AdjustRequest, AdjustMulPrivRequest>) {
-    if (req.element_type() == ElType::GFMP) {
-      eltype = ElementType::kGfmp;
+  std::string client_side(butil::endpoint2str(cntl->remote_side()).c_str());
+  auto pa = cntl->CreateProgressiveAttachment();
+  // Filled by BuildOperand below: operands, permutation meta, seeds, pad length.
+  std::tuple<std::vector<TrustedParty::Operand>, PermMeta,
+             std::vector<std::vector<PrgSeed>>, size_t>
+      adjust_params;
+
+  // The result is streamed through the progressive attachment, so `done` has
+  // to run before the computation starts. `done` frees `req`, but the
+  // computation still needs the request, so work on a private copy.
+  const auto request = *req;
+  {
+    brpc::ClosureGuard done_guard(done);  // runs `done` when this scope exits
+    try {
+      size_t field_size;
+      if constexpr (std::is_same_v<AdjustRequest, AdjustAndRequest>) {
+        field_size = 128 / 8;
+      } else {
+        field_size = request.field_size();
+      }
+      ElementType eltype = ElementType::kRing;
+      // enable eltype for selected requests here
+      // later all requests may support gfmp
+      if constexpr (std::is_same_v<AdjustRequest, AdjustMulRequest> ||
+                    std::is_same_v<AdjustRequest, AdjustMulPrivRequest>) {
+        if (request.element_type() == ElType::GFMP) {
+          eltype = ElementType::kGfmp;
+        }
+      }
+      adjust_params = BuildOperand(request, field_size, decryptor, eltype);
+    } catch (const DecryptError& e) {
+      auto err = fmt::format("Seed Decrypt error {}", e.what());
+      SPDLOG_ERROR("{}, client {}", err, client_side);
+      SendError(pa, ErrorCode::SeedDecryptError, err);
+      return;
+    } catch (const std::exception& e) {
+      auto err = fmt::format("adjust error {}", e.what());
+      SPDLOG_ERROR("{}, client {}", err, client_side);
+      SendError(pa, ErrorCode::OpAdjustError, err);
+      return;
     }
   }
-  auto [ops, seeds, pad_length] =
-      BuildOperand(req, field_size, decryptor, eltype);
-
-  if constexpr (std::is_same_v<AdjustRequest, AdjustDotRequest> ||
-                std::is_same_v<AdjustRequest, AdjustPermRequest>) {
-    auto adjusts = AdjustImpl(req, absl::MakeSpan(ops), stream_reader);
-    auto buf_vec = StripNdArray(adjusts, pad_length);
-    SendStreamData(stream_id, buf_vec);
-    return;
-  }
 
-  SPU_ENFORCE_EQ(beaver::ttp_server::kReplayChunkSize % 128, 0U);
-  SPU_ENFORCE(!ops.empty());
-  for (size_t idx = 1; idx < ops.size(); idx++) {
-    SPU_ENFORCE(ops[0].desc.shape == ops[idx].desc.shape);
-  }
-  int64_t left_elements = ops[0].desc.shape.at(0);
-  int64_t chunk_elements =
-      beaver::ttp_server::kReplayChunkSize / SizeOf(ops[0].desc.field);
-  while (left_elements > 0) {
-    int64_t cur_elements = std::min(left_elements, chunk_elements);
-    left_elements -= cur_elements;
-    for (auto& op : ops) {
-      op.desc.shape[0] = cur_elements;
-    }
-    auto adjusts = AdjustImpl(req, absl::MakeSpan(ops), stream_reader);
-    if (left_elements > 0) {
-      auto buf_vec = StripNdArray(adjusts, 0);
-      SendStreamData(stream_id, buf_vec);
+  try {
+    auto& [ops, perm, seeds, pad_length] = adjust_params;
+    if constexpr (std::is_same_v<AdjustRequest, AdjustDotRequest> ||
+                  std::is_same_v<AdjustRequest, AdjustPermRequest>) {
+      auto adjusts = AdjustImpl(request, absl::MakeSpan(ops), perm);
+      SendStreamData(adjusts, pa);  // NOTE(review): pad_length is not applied on this path - confirm it is always 0 for Dot/Perm
     } else {
-      auto buf_vec = StripNdArray(adjusts, pad_length);
-      SendStreamData(stream_id, buf_vec);
+      SPU_ENFORCE_EQ(beaver::ttp_server::kReplayChunkSize % 128, 0U);
+      SPU_ENFORCE(!ops.empty());
+      for (size_t idx = 1; idx < ops.size(); idx++) {
+        SPU_ENFORCE(ops[0].desc.shape == ops[idx].desc.shape);
+      }
+      int64_t left_elements = ops[0].desc.shape.at(0);
+      int64_t chunk_elements =
+          beaver::ttp_server::kReplayChunkSize / SizeOf(ops[0].desc.field);
+      while (left_elements > 0) {
+        int64_t cur_elements = std::min(left_elements, chunk_elements);
+        left_elements -= cur_elements;
+        for (auto& op : ops) {
+          op.desc.shape[0] = cur_elements;  // compute one chunk of elements per round
+        }
+        auto adjusts = AdjustImpl(request, absl::MakeSpan(ops), perm);
+        if (left_elements > 0) {
+          SendStreamData(adjusts, pa);
+        } else {
+          SendStreamData(adjusts, pa, pad_length);  // padding is dropped from the last chunk only
+        }
+      }
     }
+  } catch (const yacl::IoError& e) {
+    // The stream itself is broken, so the client cannot be told; log with the peer address.
+    SPDLOG_ERROR("{}, client {}", e.what(), client_side);
+    return;
+  } catch (const std::exception& e) {
+    // some other error happened, try send to client.
+    auto err = fmt::format("adjust error {}", e.what());
+    SPDLOG_ERROR("{}, client {}", err, client_side);
+    SendError(pa, ErrorCode::OpAdjustError, err);
+    return;
   }
 }
 
@@ -429,58 +410,7 @@ class ServiceImpl final : public BeaverService {
               const AdjustRequest* req, AdjustResponse* rsp,
               ::google::protobuf::Closure* done) const {
     auto* cntl = static_cast<brpc::Controller*>(controller);
-    std::string client_side(butil::endpoint2str(cntl->remote_side()).c_str());
-    brpc::StreamId stream_id = brpc::INVALID_STREAM_ID;
-    auto request = *req;
-    StreamReader reader(GetBufferLength(*req));
-
-    // To address the scenario where clients transmit data after an RPC
-    // response, give precedence to setting up absl::MakeCleanup before invoking
-    // brpc::ClosureGuard to ensure proper resource management
-    auto cleanup = absl::MakeCleanup([&]() {
-      auto cleanup = absl::MakeCleanup([&]() {
-        if (stream_id != brpc::INVALID_STREAM_ID) {
-          // To avoid encountering a core dump, it is essential to close the
-          // process stream prior to the destruction of the StreamReader object
-          reader.WaitClosed();
-        }
-      });
-      try {
-        AdjustAndSend(request, stream_id, reader, decryptor_);
-      } catch (const DecryptError& e) {
-        auto err = fmt::format("Seed Decrypt error {}", e.what());
-        SPDLOG_ERROR("{}, client {}", err,
-                     client_side);  // TODO: catch the function name
-        BeaverDownStreamMeta meta;
-        meta.err_code = ErrorCode::SeedDecryptError;
-        butil::IOBuf buf;
-        SPU_ENFORCE_EQ(buf.append(&meta, sizeof(meta)), 0);
-        SPU_ENFORCE_EQ(buf.append(err.c_str()), 0);
-        brpc::StreamWrite(stream_id, buf);
-        return;
-      } catch (const std::exception& e) {
-        auto err = fmt::format("adjust error {}", e.what());
-        SPDLOG_ERROR("{}, client {}", err, client_side);
-        BeaverDownStreamMeta meta;
-        meta.err_code = ErrorCode::OpAdjustError;
-        butil::IOBuf buf;
-        SPU_ENFORCE_EQ(buf.append(&meta, sizeof(meta)), 0);
-        SPU_ENFORCE_EQ(buf.append(err.c_str()), 0);
-        brpc::StreamWrite(stream_id, buf);
-        return;
-      }
-    });
-
-    brpc::ClosureGuard done_guard(done);
-    brpc::StreamOptions stream_options;
-    stream_options.max_buf_size = 0;  // there is no flow control for downstream
-    stream_options.handler = &reader;
-    if (brpc::StreamAccept(&stream_id, *cntl, &stream_options) != 0) {
-      SPDLOG_ERROR("Failed to accept stream");
-      rsp->set_code(ErrorCode::StreamAcceptError);
-      return;
-    }
-    rsp->set_code(ErrorCode::OK);
+    AdjustAndSend(cntl, req, done, decryptor_);
   }
 
   void AdjustMul(::google::protobuf::RpcController* controller,
@@ -558,9 +488,13 @@ std::unique_ptr<brpc::Server> RunServer(const ServerOptions& options) {
     return nullptr;
   }
 
-  // TODO: add TLS options for client/server two-way authentication
   brpc::ServerOptions brpc_options;
-  brpc_options.has_builtin_services = true;
+
+  if (options.brpc_ssl_options) {
+    *brpc_options.mutable_ssl_options() = options.brpc_ssl_options.value();
+  }
+
+  brpc_options.has_builtin_services = false;
   if (server->Start(options.port, &brpc_options) != 0) {
     SPDLOG_ERROR("Fail to start Server");
     return nullptr;
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h
index 7824057a8..7850d4d35 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <memory>
+#include <optional>
 
 #include "brpc/server.h"
 #include "yacl/base/buffer.h"
@@ -26,8 +27,8 @@ struct ServerOptions {
   // asym_crypto_schema: support ["SM2"]
   // Will support 25519 in the future, after yacl supported it.
   std::string asym_crypto_schema;
-  // TODO: Remote Attestation
   yacl::Buffer server_private_key;
+  std::optional<brpc::ServerSSLOptions> brpc_ssl_options;
 };
 
 std::unique_ptr<brpc::Server> RunServer(const ServerOptions& options);
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server_main.cc b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server_main.cc
index 0a701cf3a..fa72b8b54 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server_main.cc
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server_main.cc
@@ -13,23 +13,34 @@
 // limitations under the License.
 
 #include <filesystem>
+#include <optional>
 
 #include "absl/strings/ascii.h"
-#include "butil/base64.h"
+#include "butil/file_util.h"
 #include "gflags/gflags.h"
+#include "google/protobuf/util/json_util.h"
 #include "yacl/crypto/key_utils.h"
 
 #include "libspu/core/logging.h"
 #include "libspu/core/prelude.h"
 #include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h"
 
+#include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/config.pb.h"
+
+using spu::mpc::semi2k::beaver::ttp_server::TTPServerConfig;
+
 namespace ttp_server_config {
 DEFINE_bool(
     gen_key, false,
     "if true, gen a pair of asym_crypto_schema key in base64, then exit.");
 DEFINE_string(asym_crypto_schema, "sm2",
               "asym_crypto_schema: support [\"SM2\"]");
-DEFINE_string(server_private_key, "", "base64ed server_private_key");
+DEFINE_string(public_key_out, "sm2-key.pub", "file path to save public key");
+DEFINE_string(private_key_out, "sm2-key", "file path to save private key");
+DEFINE_string(config_file, "/home/admin/server-config.json",
+              "server config file, json format, see config.proto");
+DEFINE_string(private_key_file, "/home/admin/server-private-key",
+              "private key file path");
 DEFINE_int32(port, 9449, "TCP Port of this server");
 DEFINE_string(log_dir, "logs", "log directory");
 DEFINE_bool(enable_console_logger, true,
@@ -37,7 +48,6 @@ DEFINE_bool(enable_console_logger, true,
 DEFINE_int64(max_log_file_size, 100 * 1024 * 1024,
              "max file size for each log file");
 DEFINE_int64(max_log_file_count, 10, "max rotated log files save in dir");
-
 }  // namespace ttp_server_config
 
 void SetupLogging() {
@@ -55,24 +65,37 @@ void SetupLogging() {
 void GenKeyPair(const std::string& asym_crypto_schema) {
   auto lower_schema = absl::AsciiStrToLower(asym_crypto_schema);
 
-  std::pair<yacl::Buffer, yacl::Buffer> asym_crypto_key;
+  yacl::crypto::openssl::UniquePkey asym_crypto_key;  // one handle used for both exports below
   if (lower_schema == "sm2") {
-    asym_crypto_key = yacl::crypto::GenSm2KeyPairToPemBuf();
+    asym_crypto_key = yacl::crypto::GenSm2KeyPair();
   } else {
     SPU_THROW("not support asym_crypto_schema {}", asym_crypto_schema);
   }
 
-  std::string base64_pk;
-  std::string base64_sk;
-
-  butil::Base64Encode(std::string(asym_crypto_key.first.data<char>(),
-                                  asym_crypto_key.first.size()),
-                      &base64_pk);
-  butil::Base64Encode(std::string(asym_crypto_key.second.data<char>(),
-                                  asym_crypto_key.second.size()),
-                      &base64_sk);
-  SPDLOG_INFO("\nbase64ed public key:\n{}\n\nbase64ed private key:\n{}\n",
-              base64_pk, base64_sk);
+  yacl::crypto::ExportPublicKeyToPemFile(
+      asym_crypto_key, ttp_server_config::FLAGS_public_key_out);  // public key: PEM file at --public_key_out
+  yacl::crypto::ExportSecretKeyToDerFile(
+      asym_crypto_key, ttp_server_config::FLAGS_private_key_out);  // private key: DER file at --private_key_out
+}
+
+std::optional<TTPServerConfig> ReadServerConfig() {
+  std::string json;
+  if (!butil::ReadFileToString(
+          butil::FilePath(ttp_server_config::FLAGS_config_file), &json)) {
+    return std::nullopt;  // unreadable file: main() falls back to the flags
+  }
+  // A file that was read but does not parse as TTPServerConfig JSON is fatal.
+  TTPServerConfig config;
+  auto status = google::protobuf::util::JsonStringToMessage(json, &config);
+  // Status text is an argument, not the format: it may contain '{' or '}'.
+  SPU_ENFORCE(status.ok(), "{}", status.ToString());
+  return config;
+}
+
+yacl::Buffer ReadPrivateKey() {
+  // Load the key named by --private_key_file and hand it back as a PEM buffer.
+  const auto& key_path = ttp_server_config::FLAGS_private_key_file;
+  auto loaded_key = yacl::crypto::LoadKeyFromFile(key_path);
+  return yacl::crypto::ExportSecretKeyToPemBuf(loaded_key);
 }
 
 int main(int argc, char* argv[]) {
@@ -86,19 +109,25 @@ int main(int argc, char* argv[]) {
     return 0;
   }
 
-  yacl::Buffer decode_private_key;
-  {
-    std::string key;
-    SPU_ENFORCE(
-        butil::Base64Decode(ttp_server_config::FLAGS_server_private_key, &key));
-    decode_private_key = yacl::Buffer(key.data(), key.size());
+  spu::mpc::semi2k::beaver::ttp_server::ServerOptions ops;
+  ops.server_private_key = ReadPrivateKey();
+  auto config = ReadServerConfig();
+  if (config.has_value()) {
+    ops.port = config.value().server_port();
+    ops.asym_crypto_schema = config.value().asym_crypto_schema();
+    if (config->has_ssl()) {
+      brpc::ServerSSLOptions ssl_options;
+      ssl_options.default_cert.certificate = config.value().ssl().cert_file();
+      ssl_options.default_cert.private_key = config.value().ssl().key_file();
+      ssl_options.verify.ca_file_path = config.value().ssl().ca_file();
+      ssl_options.verify.verify_depth = config.value().ssl().verify_depth();
+      ops.brpc_ssl_options = std::move(ssl_options);
+    }
+  } else {
+    SPDLOG_INFO("Failed to read config file, use command line options");
+    ops.port = ttp_server_config::FLAGS_port;
+    ops.asym_crypto_schema = ttp_server_config::FLAGS_asym_crypto_schema;
   }
 
-  spu::mpc::semi2k::beaver::ttp_server::ServerOptions ops{
-      .port = ttp_server_config::FLAGS_port,
-      .asym_crypto_schema = ttp_server_config::FLAGS_asym_crypto_schema,
-      .server_private_key = std::move(decode_private_key),
-  };
-
   return spu::mpc::semi2k::beaver::ttp_server::RunUntilAskedToQuit(ops);
 }
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/config.proto b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/config.proto
new file mode 100644
index 000000000..456b5cd56
--- /dev/null
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/config.proto
@@ -0,0 +1,45 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package spu.mpc.semi2k.beaver.ttp_server;
+
+option cc_generic_services = true;
+
+message SSLConfig {
+  // Path of the certificate file in PEM format (brpc default_cert.certificate)
+  string cert_file = 1;
+
+  // Path of the private key file in PEM format (brpc default_cert.private_key)
+  string key_file = 2;
+
+  // Path of the trusted CA file used to verify the peer's certificate
+  string ca_file = 3;
+
+  // Maximum depth of the certificate chain accepted during verification.
+  // A value of 0 turns peer verification off.
+  int32 verify_depth = 4;
+}
+
+message TTPServerConfig {
+  // Listening port; used instead of the --port flag when a config file is read
+  int32 server_port = 1;
+
+  // Asymmetric crypto schema, support ["SM2"]; used instead of --asym_crypto_schema
+  string asym_crypto_schema = 2;
+
+  // SSL settings; the server only sets brpc SSL options when this field is present
+  SSLConfig ssl = 3;
+}
diff --git a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.proto b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.proto
index 23fd30253..0549d005c 100644
--- a/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.proto
+++ b/libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/service.proto
@@ -22,7 +22,6 @@ enum ErrorCode {
   OK = 0;
   OpAdjustError = 1;
   SeedDecryptError = 2;
-  StreamAcceptError = 3;
 }
 
 // The type of element in the field.
@@ -44,6 +43,13 @@ message PrgBufferMeta {
   bool transpose = 4;
 }
 
+message PrgRandPermMeta {
+  uint64 prg_count = 1;  // presumably the PRG counter the server passes to genRandomPerm - confirm
+  // permutation size, i.e. the number of entries in the generated permutation
+  int64 size = 2;
+  bytes encrypted_seeds = 3;  // seed material in encrypted form; decrypted on the server - TODO confirm layout
+}
+
 // TTP Beaver service for semi2k only.
 service BeaverService {
   // V1 adjust ops
@@ -201,13 +207,12 @@ message AdjustPermRequest {
   repeated PrgBufferMeta prg_inputs = 1;
   // What field size should be used to interpret buffer content
   uint32 field_size = 2;
+  // Rand permutation
+  PrgRandPermMeta perm = 3;
   // output
-  // adjust_b = (apply inverse permutation perm_vec to ra) - rb
+  // adjust_b = (apply inverse permutation perm to ra) - rb
   // make
-  // (adjust_b + rb) = apply inverse permutation perm_vec to ra
+  // (adjust_b + rb) = apply inverse permutation perm to ra
 }
 
-message AdjustResponse {
-  ErrorCode code = 1;
-  string message = 2;
-}
+message AdjustResponse {}
diff --git a/libspu/mpc/semi2k/beaver/beaver_interface.h b/libspu/mpc/semi2k/beaver/beaver_interface.h
index 89c582671..7a7b7f6a2 100644
--- a/libspu/mpc/semi2k/beaver/beaver_interface.h
+++ b/libspu/mpc/semi2k/beaver/beaver_interface.h
@@ -18,6 +18,7 @@
 
 #include "yacl/base/buffer.h"
 
+#include "libspu/core/shape.h"
 #include "libspu/mpc/common/prg_tensor.h"
 
 #include "libspu/spu.pb.h"
@@ -46,6 +47,7 @@ class Beaver {
 
   using Array = yacl::Buffer;
   using Triple = std::tuple<Array, Array, Array>;
+  using PremTriple = std::tuple<Array, Array, Index>;
   using Pair = std::pair<Array, Array>;
 
   virtual ~Beaver() = default;
@@ -85,20 +87,21 @@ class Beaver {
 
   // Generate share permutation pair.
   /*
-          ┌───────────────────────┐
-          │                       │   A i
-  Perm    │      Permutation      ├─────►
-  ───────►│                       │   B i
-          │    Pair  Generator    ├─────►
-          │                       │
+          ┌───────────────────────┐   A i
+          │                       ├─────►
+  size    │      Permutation      │   B i
+ ────────►│                       ├─────►
+          │    Pair  Generator    │   π
+          │                       ├─────►
           └───────────────────────┘
 
-                InversePerm(A) = B
+           InversePermute(A, π) = B
 
-  if perm_rank == lctx->Rank(); perm not empty.
+  If rank == perm_rank, ret[2] holds π; otherwise ret[2] is empty.
+  The perm_rank party should read ret[2] as a Span<const int64_t>(buffer, size) view.
   */
-  virtual Pair PermPair(FieldType field, int64_t size, size_t perm_rank,
-                        absl::Span<const int64_t> perm_vec) = 0;
+  virtual PremTriple PermPair(FieldType field, int64_t size,
+                              size_t perm_rank) = 0;
 
   virtual std::unique_ptr<Beaver> Spawn() = 0;
 
diff --git a/libspu/mpc/semi2k/boolean.cc b/libspu/mpc/semi2k/boolean.cc
index 707eb91a4..a303f7f2d 100644
--- a/libspu/mpc/semi2k/boolean.cc
+++ b/libspu/mpc/semi2k/boolean.cc
@@ -87,6 +87,22 @@ NdArrayRef CastTypeB::proc(KernelEvalContext*, const NdArrayRef& in,
   return in.as(to_type);
 }
 
+NdArrayRef RandB::proc(KernelEvalContext* ctx, const Shape& shape) const {
+  // only rand bit is supported
+  constexpr size_t kNumBits = 1;
+  const auto field = ctx->getState<Z2kState>()->getDefaultField();
+  auto* prg = ctx->getState<PrgState>();
+  return DISPATCH_ALL_FIELDS(field, [&]() {
+    auto rnd = prg->genPriv(field, shape);
+    NdArrayView<ring2k_t> view(rnd);
+
+    // keep only the lowest bit of every privately generated element
+    pforeach(0, shape.numel(), [&](int64_t i) { view[i] &= 1; });
+
+    return makeBShare(rnd, field, kNumBits);
+  });
+}
+
 NdArrayRef B2P::proc(KernelEvalContext* ctx, const NdArrayRef& in) const {
   const auto field = in.eltype().as<Ring2k>()->field();
   auto* comm = ctx->getState<Communicator>();
@@ -133,10 +149,12 @@ NdArrayRef AndBP::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
 NdArrayRef AndBB::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
                        const NdArrayRef& rhs) const {
   SPU_ENFORCE(lhs.shape() == rhs.shape());
+  SPU_ENFORCE(lhs.eltype().as<Ring2k>()->field() ==
+              rhs.eltype().as<Ring2k>()->field());
 
   auto* comm = ctx->getState<Communicator>();
   auto* beaver = ctx->getState<Semi2kState>()->beaver();
-  const auto field = ctx->getState<Z2kState>()->getDefaultField();
+  const auto field = lhs.eltype().as<Ring2k>()->field();
 
   const size_t out_nbits = std::min(getNumBits(lhs), getNumBits(rhs));
   const PtType backtype = getBacktype(out_nbits);
@@ -192,6 +210,8 @@ NdArrayRef AndBB::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
 NdArrayRef XorBP::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
                        const NdArrayRef& rhs) const {
   SPU_ENFORCE(lhs.numel() == rhs.numel());
+  SPU_ENFORCE(lhs.eltype().as<Ring2k>()->field() ==
+              rhs.eltype().as<Ring2k>()->field());
 
   auto* comm = ctx->getState<Communicator>();
 
@@ -208,8 +228,10 @@ NdArrayRef XorBP::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
 NdArrayRef XorBB::proc(KernelEvalContext* ctx, const NdArrayRef& lhs,
                        const NdArrayRef& rhs) const {
   SPU_ENFORCE(lhs.numel() == rhs.numel());
+  SPU_ENFORCE(lhs.eltype().as<Ring2k>()->field() ==
+              rhs.eltype().as<Ring2k>()->field());
 
-  const auto field = ctx->getState<Z2kState>()->getDefaultField();
+  const auto field = lhs.eltype().as<Ring2k>()->field();
   const size_t out_nbits = std::max(getNumBits(lhs), getNumBits(rhs));
   return makeBShare(ring_xor(lhs, rhs), field, out_nbits);
 }
diff --git a/libspu/mpc/semi2k/boolean.h b/libspu/mpc/semi2k/boolean.h
index 766e39c81..3ec63505f 100644
--- a/libspu/mpc/semi2k/boolean.h
+++ b/libspu/mpc/semi2k/boolean.h
@@ -39,6 +39,17 @@ class CastTypeB : public CastTypeKernel {
                   const Type& to_type) const override;
 };
 
+class RandB : public RandKernel {
+ public:
+  static constexpr const char* kBindName() { return "rand_b"; }
+
+  // Cost model: this kernel reports zero latency and zero communication.
+  ce::CExpr latency() const override { return ce::Const(0); }
+  ce::CExpr comm() const override { return ce::Const(0); }
+
+  NdArrayRef proc(KernelEvalContext* ctx, const Shape& shape) const override;
+};
+
 class B2P : public UnaryKernel {
  public:
   static constexpr const char* kBindName() { return "b2p"; }
diff --git a/libspu/mpc/semi2k/lowmc.cc b/libspu/mpc/semi2k/lowmc.cc
new file mode 100644
index 000000000..993e563fa
--- /dev/null
+++ b/libspu/mpc/semi2k/lowmc.cc
@@ -0,0 +1,393 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/mpc/semi2k/lowmc.h"
+
+#include "libspu/mpc/ab_api.h"
+#include "libspu/mpc/common/prg_state.h"
+#include "libspu/mpc/common/pv2k.h"
+#include "libspu/mpc/semi2k/type.h"
+#include "libspu/mpc/utils/lowmc.h"
+#include "libspu/mpc/utils/lowmc_utils.h"
+#include "libspu/mpc/utils/ring_ops.h"
+
+namespace spu::mpc::semi2k {
+
+namespace {
+
+NdArrayRef wrap_xor_bp(SPUContext* ctx, const NdArrayRef& lhs,
+                       const NdArrayRef& rhs) {
+  SPU_ENFORCE(lhs.shape() == rhs.shape());  // shapes must match exactly
+  return UnwrapValue(xor_bp(ctx, WrapValue(lhs), WrapValue(rhs)));
+}
+
+NdArrayRef wrap_xor_bb(SPUContext* ctx, const NdArrayRef& lhs,
+                       const NdArrayRef& rhs) {
+  SPU_ENFORCE(lhs.shape() == rhs.shape());  // shapes must match exactly
+  return UnwrapValue(xor_bb(ctx, WrapValue(lhs), WrapValue(rhs)));
+}
+
+NdArrayRef wrap_and_bb(SPUContext* ctx, const NdArrayRef& lhs,
+                       const NdArrayRef& rhs) {
+  SPU_ENFORCE(lhs.shape() == rhs.shape());  // shapes must match exactly
+  return UnwrapValue(and_bb(ctx, WrapValue(lhs), WrapValue(rhs)));
+}
+
+/// Some shape utils
+NdArrayRef extract_bit_arr(const NdArrayRef& in, int64_t idx) {
+  // Returns a 1-bit boolean share holding bit `idx` of every element of `in`.
+  const auto ring = in.eltype().as<BShrTy>()->field();
+  SPU_ENFORCE((uint64_t)idx < SizeOf(ring) * 8, "bit extract out of range.");
+
+  NdArrayRef bit_arr(makeType<BShrTy>(ring, 1), in.shape());
+  DISPATCH_ALL_FIELDS(ring, [&]() {
+    NdArrayView<ring2k_t> src(in);
+    NdArrayView<ring2k_t> dst(bit_arr);
+
+    pforeach(0, in.numel(), [&](int64_t pos) {  //
+      dst[pos] = (src[pos] >> idx) & 1;
+    });
+  });
+
+  return bit_arr;
+}
+
+// one bit of every 3-bit box: offset=0 means c, offset=2 means a
+NdArrayRef extract_packed_bit_arr(const NdArrayRef& state, int64_t n_boxes,
+                                  int64_t offset) {
+  const auto field = state.eltype().as<BShrTy>()->field();
+  const auto& ori_shape = state.shape();
+  // boxes are stacked along axis 0, which a 0-dim state does not have.
+  SPU_ENFORCE(ori_shape.ndim() >= 1, "state must have at least one axis.");
+  const auto ori_numel = ori_shape.numel();
+  Shape to_shape = ori_shape;
+  to_shape[0] = ori_shape[0] * n_boxes;
+
+  NdArrayRef ret(makeType<BShrTy>(field, 1), to_shape);
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    NdArrayView<ring2k_t> _state(state);
+    NdArrayView<ring2k_t> _ret(ret);
+
+    for (int64_t i = 0; i < n_boxes; ++i) {
+      // box i occupies bits [3i, 3i + 3) of each state element
+      pforeach(0, ori_numel, [&](int64_t idx) {  //
+        _ret[idx + i * ori_numel] = (_state[idx] >> (3 * i + offset)) & 1;
+      });
+    }
+  });
+
+  return ret;
+}
+
+// do memory copying by hand, get packed (abc, bca)
+std::tuple<NdArrayRef, NdArrayRef> construct_concat_arr(const NdArrayRef& state,
+                                                        int64_t n_boxes) {
+  const auto field = state.eltype().as<BShrTy>()->field();
+  const auto bty = makeType<BShrTy>(field, 3);
+
+  const auto& ori_shape = state.shape();
+  // boxes are stacked along axis 0, which a 0-dim state does not have.
+  SPU_ENFORCE(ori_shape.ndim() >= 1, "state must have at least one axis.");
+  const auto ori_numel = ori_shape.numel();
+  Shape to_shape = ori_shape;
+  to_shape[0] = ori_shape[0] * n_boxes;
+
+  NdArrayRef abc(bty, to_shape);
+  NdArrayRef bca(bty, to_shape);
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    NdArrayView<ring2k_t> _state(state);
+    NdArrayView<ring2k_t> _abc(abc);
+    NdArrayView<ring2k_t> _bca(bca);
+
+    for (int64_t i = 0; i < n_boxes; ++i) {
+      pforeach(0, ori_numel, [&](int64_t idx) {
+        // xxxx xabc => 0000 0abc
+        const ring2k_t box = (_state[idx] >> (3 * i)) & 7;
+        _abc[idx + i * ori_numel] = box;
+        // 0000 0abc => 0000 0bca
+        _bca[idx + i * ori_numel] = ((box & 3) << 1) | (box >> 2);
+      });
+    }
+  });
+
+  return std::make_tuple(abc, bca);
+}
+
+// Picks the idx-th block along axis 0: for x of shape (k * n0, n1, ...),
+// returns the part of shape (n0, n1, ...) == ori_shape.
+NdArrayRef slice_arr(const NdArrayRef& x, int64_t idx, const Shape& ori_shape) {
+  const auto& stacked_shape = x.shape();
+  SPU_ENFORCE(ori_shape.ndim() == stacked_shape.ndim(), "axis mismatch.");
+  SPU_ENFORCE(std::equal(stacked_shape.begin() + 1, stacked_shape.end(),
+                         ori_shape.begin() + 1),
+              "mismatch of shape.");
+
+  // rows [idx * n0, (idx + 1) * n0) on axis 0, full extent on other axes
+  const auto n_rows = ori_shape[0];
+  Index lo(ori_shape.ndim(), 0);
+  Index hi(ori_shape.begin(), ori_shape.end());
+  lo[0] = idx * n_rows;
+  hi[0] = lo[0] + n_rows;
+  return x.slice(lo, hi, {});
+}
+
+/// Some core operations for LowMC layer
+NdArrayRef Sbox(KernelEvalContext* ctx, const NdArrayRef& state,
+                int64_t n_boxes, size_t n_bits) {
+  // The SboxLayer is originally defined as a look-up table; we replace it
+  // with equivalent logical operations,
+  // i.e. Sbox(a, b, c) = (a + b * c, a + b + a * c, a + b + c + a * b),
+  // where `+` is XOR, `*` is AND
+  // TODO: Lots of memory copying here to save rounds, use FM8 for temporary
+  // a,b,c to save memory
+  NdArrayRef abc_arr;
+  NdArrayRef bca_arr;
+  // the original data: ... a2b2c2 a1b1c1 a0b0c0
+  // we concat all abc to get [a2b2c2; a1b1c1; a0b0c0]
+  // we concat all bca to get [b2c2a2; b1c1a1; b0c0a0]
+  std::tie(abc_arr, bca_arr) = construct_concat_arr(state, n_boxes);
+
+  // do all the expensive secret AND ops of every box in one batched call
+  auto abc_and_bca_arr = wrap_and_bb(ctx->sctx(), abc_arr, bca_arr);
+  auto abc_xor_bca_arr = wrap_xor_bb(ctx->sctx(), abc_arr, bca_arr);
+
+  // extract all ab (bit 2), bc (bit 1), ac (bit 0)
+  auto ab_arr = extract_bit_arr(abc_and_bca_arr, 2);
+  auto bc_arr = extract_bit_arr(abc_and_bca_arr, 1);
+  auto ac_arr = extract_bit_arr(abc_and_bca_arr, 0);
+
+  // extract a+b (bit 2), b+c (bit 1)
+  auto a_b_arr = extract_bit_arr(abc_xor_bca_arr, 2);
+  auto b_c_arr = extract_bit_arr(abc_xor_bca_arr, 1);
+
+  // extract a (bit 2 of every box)
+  auto a_arr = extract_packed_bit_arr(state, n_boxes, 2);
+
+  // a + b * c
+  auto new_a = wrap_xor_bb(ctx->sctx(), a_arr, bc_arr);
+  // a + b + a * c
+  auto new_b = wrap_xor_bb(ctx->sctx(), a_b_arr, ac_arr);
+  // a + b + c + a * b
+  auto a_b_c_arr = wrap_xor_bb(ctx->sctx(), b_c_arr, a_arr);
+  auto new_c = wrap_xor_bb(ctx->sctx(), a_b_c_arr, ab_arr);
+
+  std::vector<NdArrayRef> bits_arr;
+  bits_arr.reserve(n_bits);
+  const auto& ori_shape = state.shape();
+  // collect the first 3*n_boxes bits, lowest bit first: c, b, a of each box
+  for (int64_t i = 0; i < n_boxes; ++i) {
+    bits_arr.push_back(slice_arr(new_c, i, ori_shape));
+    bits_arr.push_back(slice_arr(new_b, i, ori_shape));
+    bits_arr.push_back(slice_arr(new_a, i, ori_shape));
+  }
+
+  // concat all bits: bits_arr[i] becomes bit i of the result
+  const auto field = state.eltype().as<BShrTy>()->field();
+  auto ret = ring_zeros(field, state.shape()).as(state.eltype());
+
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    NdArrayView<ring2k_t> _ret(ret);
+    NdArrayView<ring2k_t> _state(state);
+
+    for (int64_t i = 0; i < 3 * n_boxes; ++i) {
+      NdArrayView<ring2k_t> _tmp(bits_arr[i]);
+
+      pforeach(0, ret.numel(), [&](int64_t idx) {  //
+        _ret[idx] = _ret[idx] | ((_tmp[idx] & 1) << i);
+      });
+    }
+
+    // The remaining higher bits are unchanged by the SboxLayer, so copy them
+    pforeach(0, ret.numel(), [&](int64_t idx) {  //
+      _ret[idx] = _ret[idx] | ((_state[idx] >> (3 * n_boxes)) << (3 * n_boxes));
+    });
+  });
+
+  return ret;
+}
+
+NdArrayRef Affine(KernelEvalContext* ctx, const LowMC& cipher,
+                  const NdArrayRef& state, int64_t rounds) {
+  // linear layer: GF(2) product of this round's L matrix with the state;
+  // `ctx` is not used here.
+  const auto ring = state.eltype().as<BShrTy>()->field();
+  return dot_product_gf2(cipher.Lmat()[rounds], state, ring);
+}
+
+}  // namespace
+
+NdArrayRef LowMcB::proc(KernelEvalContext* ctx, const NdArrayRef& in) const {
+  auto* prg = ctx->getState<PrgState>();
+
+  // private randomness: this party's share of the cipher key
+  // (the key itself is the XOR of the parties' shares)
+  uint128_t key_share;
+  prg->fillPriv(absl::MakeSpan(&key_share, 1));
+  // public randomness: the seed handed to `encrypt` together with the key
+  uint128_t pub_seed;
+  prg->fillPubl(absl::MakeSpan(&pub_seed, 1));
+
+  return encrypt(ctx, in, key_share, pub_seed);
+}
+
+NdArrayRef LowMcB::encrypt(KernelEvalContext* ctx, const NdArrayRef& in,
+                           uint128_t key, uint128_t seed) const {
+  const auto field = in.eltype().as<BShrTy>()->field();
+  const auto numel = in.numel();
+  const auto k = SizeOf(field) * 8;  // ring width in bits
+  const auto shape = in.shape();
+  const auto pub_ty = makeType<Pub2kTy>(field);
+
+  NdArrayRef out;
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    auto d = get_data_complexity(numel);
+    auto cipher = LowMC(field, seed, d);
+    SPU_ENFORCE(static_cast<int64_t>(k) == cipher.data_block_size(),
+                "block size must be equal now.");
+
+    // generate the round keys (rounds() + 1 of them are indexed below)
+    auto round_keys =
+        generate_round_keys(cipher.Kmat(), key, cipher.rounds(), field);
+
+    // Following the same steps as in plaintext, with MPC primitives for bit
+    // operations.
+    // NOTE(review): round keys are tagged Pub2kTy yet go to xor_bb - confirm.
+    // 1. key whitening: state = in ^ roundKeys[0]
+    auto round_key0 = round_keys[0].broadcast_to(shape, {}).as(pub_ty);
+    out = wrap_xor_bb(ctx->sctx(), in, round_key0);
+
+    // 2. round loop: for i = 1 to r
+    // state = SboxLayer(state)
+    // state = GF2Dot(Lmatrix[i-1], state)
+    // state = state ^ RoundConstants[i-1]
+    // state = state ^ RoundKeys[i]
+    const auto n_boxes = cipher.number_of_boxes();
+    SPU_ENFORCE((int64_t)k >= 3 * n_boxes, "invalid parameters setting.");
+
+    for (int64_t r = 1; r <= cipher.rounds(); ++r) {
+      // The only non-linear layer in LowMC
+      out = Sbox(ctx, out, n_boxes, k);
+
+      out = Affine(ctx, cipher, out, /*round idx*/ r - 1).as(in.eltype());
+
+      auto round_constant =
+          cipher.RoundConstants()[r - 1].broadcast_to(shape, {}).as(pub_ty);
+      out = wrap_xor_bp(ctx->sctx(), out, round_constant);
+
+      auto round_key = round_keys[r].broadcast_to(shape, {}).as(pub_ty);
+      out = wrap_xor_bb(ctx->sctx(), out, round_key);
+    }
+  });
+
+  return out;
+}
+
+namespace {
+NdArrayRef wrap_lowmcb(KernelEvalContext* ctx, const NdArrayRef& in) {
+  return LowMcB{}.proc(ctx, in);  // single-input kernel on a temporary
+}
+
+FieldType get_dst_field(const int64_t k) {
+  // pick the narrowest ring that can hold k bits
+  if (k <= 32) {
+    return FM32;
+  }
+  if (k <= 64) {
+    return FM64;
+  }
+  return FM128;  // no matter how large k is, we always use FM128.
+}
+
+NdArrayRef concate_bits(const std::vector<NdArrayRef>& inputs,
+                        const FieldType dst_field) {
+  SPU_ENFORCE(!inputs.empty(), "nothing to concat.");
+  const auto field = inputs[0].eltype().as<Ring2k>()->field();
+  const auto k = SizeOf(field) * 8;
+  SPU_ENFORCE(k * inputs.size() <= SizeOf(dst_field) * 8,
+              "too much inputs to concat!");
+  // every input is read at each flat index of inputs[0]; sizes must agree
+  for (const auto& inp : inputs) {
+    SPU_ENFORCE(inp.numel() == inputs[0].numel(), "mismatch of numel.");
+  }
+
+  auto ret = ring_zeros(dst_field, inputs[0].shape());
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    using src_el_t = ring2k_t;
+    DISPATCH_ALL_FIELDS(dst_field, [&]() {
+      using dst_el_t = ring2k_t;
+      NdArrayView<dst_el_t> _ret(ret);
+      for (uint64_t i = 0; i < inputs.size(); ++i) {
+        NdArrayView<src_el_t> _inp(inputs[i]);
+        // input i lands in bits [k * i, k * (i + 1)) of the output
+        pforeach(0, ret.numel(), [&](int64_t idx) {  //
+          _ret[idx] |= (static_cast<dst_el_t>(_inp[idx]) << (k * i));
+        });
+      }
+    });
+  });
+
+  return ret;
+}
+
+}  // namespace
+
+NdArrayRef MultiKeyLowMcB::proc(KernelEvalContext* ctx,
+                                const std::vector<NdArrayRef>& inputs) const {
+  SPU_ENFORCE(!inputs.empty());
+  const auto field = inputs[0].eltype().as<Ring2k>()->field();
+  SPU_ENFORCE(std::all_of(inputs.begin() + 1, inputs.end(),
+                          [&field](const NdArrayRef& v) {
+                            return v.eltype().as<Ring2k>()->field() == field;
+                          }),
+              "all inputs must have the same field");
+
+  if (inputs.size() == 1) {
+    return wrap_lowmcb(ctx, inputs[0]);
+  }
+
+  // SPU natively supports rings of at most 128 bits (FM128).
+  static constexpr int64_t kMaxBits = 128;
+  static constexpr FieldType kMaxField = FM128;
+
+  const int64_t k = SizeOf(field) * 8;
+  const auto total_bits = k * inputs.size();
+
+  if (total_bits <= kMaxBits) {
+    // everything fits in one ring element: just concat all bits.
+    const auto dst_field = get_dst_field(total_bits);
+    auto concat_inp =
+        concate_bits(inputs, dst_field).as(makeType<BShrTy>(dst_field));
+    return wrap_lowmcb(ctx, concat_inp);
+  } else {
+    // too wide: re-map all inputs to FM128
+    auto* prg_state = ctx->getState<PrgState>();
+    const Shape rand_mat_shape = {kMaxBits};
+    auto remapping_inp = ring_zeros(kMaxField, inputs[0].shape());
+    // e.g. inputs = [x0, x1, x2, x3], each xi is 64 bits, we want to remap
+    // these to 128 bits.
+    // Conceptually, we generate a public random binary matrix M (shape = (128,
+    // 64*4)), compute gf2dot(M, inputs), which is a 128-bit output.
+    for (const auto& item : inputs) {
+      // logically, a (128, k) binary matrix: 128 ring elements of k bits each
+      const auto rand_mat = prg_state->genPubl(field, rand_mat_shape);
+      // split the large gf2dot into several small gf2dots and use xor to
+      // combine them.
+      auto part_dot = dot_product_gf2(rand_mat, item, kMaxField);
+      ring_xor_(remapping_inp, part_dot);
+    }
+    return wrap_lowmcb(ctx, remapping_inp.as(makeType<BShrTy>(kMaxField)));
+  }
+}
+
+}  // namespace spu::mpc::semi2k
diff --git a/libspu/mpc/semi2k/lowmc.h b/libspu/mpc/semi2k/lowmc.h
new file mode 100644
index 000000000..c64262ffa
--- /dev/null
+++ b/libspu/mpc/semi2k/lowmc.h
@@ -0,0 +1,72 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "libspu/mpc/kernel.h"
+
+namespace spu::mpc::semi2k {
+
+// ref: Ciphers for MPC and FHE
+// https://eprint.iacr.org/2016/687.pdf
+//
+// LowMC is an MPC-friendly block cipher which minimizes the depth and the
+// number of AND gates.
+// The current implementation only supports 128-bit key security, but users
+// can change the data complexity to achieve higher efficiency.
+//
+// NOTE: Although LowMC is protocol agnostic (it only depends on some boolean
+// ops), we still implement it in each protocol kernel for now, for efficiency
+// reasons.
+class LowMcB : public UnaryKernel {
+ public:
+  static constexpr const char* kBindName() { return "lowmc_b"; }
+
+  // the concrete cost depends on the data complexity
+  Kind kind() const override { return Kind::Dynamic; }
+
+  NdArrayRef proc(KernelEvalContext* ctx, const NdArrayRef& in) const override;
+
+  // internal function, public only for testing
+  NdArrayRef encrypt(KernelEvalContext* ctx, const NdArrayRef& in,
+                     uint128_t key, uint128_t seed  // single key now
+  ) const;
+};
+
+// For the multi-key case, we use the scheme in:
+// REF: https://eprint.iacr.org/2019/518
+//
+// Given m keys of k bits each, logically:
+//   1. Concatenate all the keys into a single mk-bit key `X`.
+//   2. All parties sample the same random binary matrix `M` of shape (mk, n),
+//   where n is the number of bits SPU can handle (e.g. 128).
+//   3. Compute `Y = gf2dot(X, M)` and use `Y` as the input of the LowMC
+//   encryption.
+//
+// Collision probability p: about 2^{-n+q}, where q ~= 2 * log2(D) and D is
+// the total number of encodings,
+// e.g. when n = 128, D = 2**20 (1M), p ~= 2^{-88}
+//      when n = 128, D = 2**30 (1B), p ~= 2^{-68}
+class MultiKeyLowMcB : public MultiKeyLowMcKernel {
+ public:
+  static constexpr const char* kBindName() { return "multi_key_lowmc_b"; }
+
+  // the concrete cost depends on the data complexity
+  Kind kind() const override { return Kind::Dynamic; }
+
+  NdArrayRef proc(KernelEvalContext* ctx,
+                  const std::vector<NdArrayRef>& inputs) const override;
+};
+
+}  // namespace spu::mpc::semi2k
diff --git a/libspu/mpc/semi2k/permute.cc b/libspu/mpc/semi2k/permute.cc
index 71f68ef42..a11424b5d 100644
--- a/libspu/mpc/semi2k/permute.cc
+++ b/libspu/mpc/semi2k/permute.cc
@@ -40,51 +40,58 @@ inline int64_t getOwner(const NdArrayRef& x) {
   return x.eltype().as<Priv2kTy>()->owner();
 }
 
-Index ring2pv(const NdArrayRef& x) {
-  SPU_ENFORCE(x.eltype().isa<Ring2k>(), "must be ring2k_type, got={}",
-              x.eltype());
-  const auto field = x.eltype().as<Ring2k>()->field();
-  Index pv(x.numel());
-  DISPATCH_ALL_FIELDS(field, [&]() {
-    NdArrayView<ring2k_t> _x(x);
-    pforeach(0, x.numel(), [&](int64_t idx) { pv[idx] = int64_t(_x[idx]); });
-  });
-  return pv;
-}
-
 // Secure inverse permutation of x by perm_rank's permutation pv
-// The idea here is:
-// Input permutation pv, beaver generates perm pair {<A>, <B>} that
-// InversePermute(A, pv) = B. So we can get <y> = InversePermute(open(<x> -
-// <A>), pv) + <B> that y = InversePermute(x, pv).
 NdArrayRef SecureInvPerm(KernelEvalContext* ctx, const NdArrayRef& x,
                          const NdArrayRef& perm, size_t perm_rank) {
+  // INPUT: X and private perm owned by perm_rank
   const auto lctx = ctx->lctx();
   const auto field = x.eltype().as<AShrTy>()->field();
+  auto* comm = ctx->getState<Communicator>();
   auto* beaver = ctx->getState<Semi2kState>()->beaver();
   auto numel = x.numel();
 
-  Index pv;
-  if (perm.eltype().isa<PShare>() ||
-      (perm.eltype().isa<Private>() && isOwner(ctx, perm.eltype()))) {
-    pv = ring2pv(perm);
+  if (lctx->Rank() == perm_rank) {
+    SPU_ENFORCE(perm.numel() == numel);
+    SPU_ENFORCE(perm.eltype().isa<PShare>() ||
+                (perm.eltype().isa<Private>() && isOwner(ctx, perm.eltype())));
   }
-  auto [a_buf, b_buf] = beaver->PermPair(field, numel, perm_rank, pv);
+
+  // beaver gives ai, bi, pr makes InvPerm(A, pr) = B
+  // pr is a private random permutation owned by perm_rank.
+  auto [a_buf, b_buf, pr] = beaver->PermPair(field, numel, perm_rank);
+
+  NdArrayRef po;
+  if (lctx->Rank() == perm_rank) {
+    // mask perm by random permutation pr, get po = InvPerm(perm, pr)
+    auto p = std::move(pr);
+    po = applyInvPerm(perm, p);
+    // so: InvPerm(B, po) = InvPerm(InvPerm(A, pr), po) = InvPerm(A, perm)
+  }
+  // broadcast po to all rank.
+  po = comm->broadcast(po, perm_rank, perm.eltype(), perm.shape(),
+                       "perm_open_perm");
 
   NdArrayRef a(std::make_shared<yacl::Buffer>(std::move(a_buf)), x.eltype(),
                x.shape());
   NdArrayRef b(std::make_shared<yacl::Buffer>(std::move(b_buf)), x.eltype(),
                x.shape());
 
-  auto t = wrap_a2v(ctx->sctx(), ring_sub(x, a).as(x.eltype()), perm_rank);
+  // reveal X-A to perm_rank
+  auto x_a = wrap_a2v(ctx->sctx(), ring_sub(x, a).as(x.eltype()), perm_rank);
 
   if (lctx->Rank() == perm_rank) {
-    SPU_ENFORCE(pv.size());
-    ring_add_(b, applyInvPerm(t, pv));
+    // perm_rank get InvPerm(X-A, perm) + InvPerm(bi, po)
+    b = applyInvPerm(b, po);
+    ring_add_(b, applyInvPerm(x_a, perm));
     return b.as(x.eltype());
   } else {
-    return b.as(x.eltype());
+    // others rank get InvPerm(bi, po)
+    return applyInvPerm(b, po).as(x.eltype());
   }
+  // finally get:
+  // InvPerm(X-A, perm) + ∑InvPerm(bi, po) =
+  // InvPerm(X, perm) - InvPerm(A, perm) + InvPerm(B, po) =
+  // InvPerm(X, perm)
 }
 
 }  // namespace
diff --git a/libspu/mpc/semi2k/protocol.cc b/libspu/mpc/semi2k/protocol.cc
index 33d6226b8..cc7c8f0d6 100644
--- a/libspu/mpc/semi2k/protocol.cc
+++ b/libspu/mpc/semi2k/protocol.cc
@@ -21,6 +21,7 @@
 #include "libspu/mpc/semi2k/boolean.h"
 #include "libspu/mpc/semi2k/conversion.h"
 #include "libspu/mpc/semi2k/exp.h"
+#include "libspu/mpc/semi2k/lowmc.h"
 #include "libspu/mpc/semi2k/permute.h"
 #include "libspu/mpc/semi2k/state.h"
 #include "libspu/mpc/semi2k/type.h"
@@ -51,22 +52,24 @@ void regSemi2kProtocol(SPUContext* ctx,
   ctx->prot()->addState<Semi2kState>(ctx->config(), lctx);
   ctx->prot()
       ->regKernel<
-          semi2k::P2A, semi2k::A2P, semi2k::A2V, semi2k::V2A,                //
-          semi2k::NegateA,                                                   //
-          semi2k::AddAP, semi2k::AddAA,                                      //
-          semi2k::MulAP, semi2k::MulAA, semi2k::SquareA,                     //
-          semi2k::MatMulAP, semi2k::MatMulAA,                                //
-          semi2k::LShiftA, semi2k::LShiftB, semi2k::RShiftB,                 //
-          semi2k::ARShiftB,                                                  //
-          semi2k::CommonTypeB, semi2k::CommonTypeV, semi2k::CastTypeB,       //
-          semi2k::B2P, semi2k::P2B,                                          //
-          semi2k::A2B, semi2k::B2A_Randbit, semi2k::B2A_Disassemble,         //
-          semi2k::AndBP, semi2k::AndBB, semi2k::XorBP, semi2k::XorBB,        //
-          semi2k::BitrevB,                                                   //
-          semi2k::BitIntlB, semi2k::BitDeintlB,                              //
-          semi2k::RandA, semi2k::RandPermM, semi2k::PermAM, semi2k::PermAP,  //
-          semi2k::InvPermAM, semi2k::InvPermAP, semi2k::InvPermAV,           //
-          semi2k::EqualAA, semi2k::EqualAP,                                  //
+          semi2k::P2A, semi2k::A2P, semi2k::A2V,
+          semi2k::V2A,                                                  //
+          semi2k::NegateA,                                              //
+          semi2k::AddAP, semi2k::AddAA,                                 //
+          semi2k::MulAP, semi2k::MulAA, semi2k::SquareA,                //
+          semi2k::MatMulAP, semi2k::MatMulAA,                           //
+          semi2k::LShiftA, semi2k::LShiftB, semi2k::RShiftB,            //
+          semi2k::ARShiftB,                                             //
+          semi2k::CommonTypeB, semi2k::CommonTypeV, semi2k::CastTypeB,  //
+          semi2k::B2P, semi2k::P2B,                                     //
+          semi2k::A2B, semi2k::B2A_Randbit, semi2k::B2A_Disassemble,    //
+          semi2k::AndBP, semi2k::AndBB, semi2k::XorBP, semi2k::XorBB,   //
+          semi2k::BitrevB,                                              //
+          semi2k::BitIntlB, semi2k::BitDeintlB,                         //
+          semi2k::RandA, semi2k::RandB,                                 //
+          semi2k::RandPermM, semi2k::PermAM, semi2k::PermAP,            //
+          semi2k::InvPermAM, semi2k::InvPermAP, semi2k::InvPermAV,      //
+          semi2k::EqualAA, semi2k::EqualAP,                             //
           semi2k::BeaverCacheKernel>();
 
   if (ctx->config().trunc_allow_msb_error()) {
@@ -86,6 +89,8 @@ void regSemi2kProtocol(SPUContext* ctx,
     }
   }
   // ctx->prot()->regKernel<semi2k::B2A>();
+  ctx->prot()->regKernel<semi2k::LowMcB>();
+  ctx->prot()->regKernel<semi2k::MultiKeyLowMcB>();
 }
 
 std::unique_ptr<SPUContext> makeSemi2kProtocol(
diff --git a/libspu/mpc/semi2k/protocol_test.cc b/libspu/mpc/semi2k/protocol_test.cc
index eb1a6c604..abf75fffc 100644
--- a/libspu/mpc/semi2k/protocol_test.cc
+++ b/libspu/mpc/semi2k/protocol_test.cc
@@ -18,6 +18,8 @@
 
 #include "gtest/gtest.h"
 #include "yacl/crypto/key_utils.h"
+#include "yacl/crypto/rand/rand.h"
+#include "yacl/utils/elapsed_timer.h"
 
 #include "libspu/mpc/ab_api.h"
 #include "libspu/mpc/ab_api_test.h"
@@ -26,10 +28,13 @@
 #include "libspu/mpc/common/communicator.h"
 #include "libspu/mpc/semi2k/beaver/beaver_impl/ttp_server/beaver_server.h"
 #include "libspu/mpc/semi2k/exp.h"
+#include "libspu/mpc/semi2k/lowmc.h"
 #include "libspu/mpc/semi2k/prime_utils.h"
 #include "libspu/mpc/semi2k/state.h"
 #include "libspu/mpc/semi2k/type.h"
 #include "libspu/mpc/utils/gfmp.h"
+#include "libspu/mpc/utils/lowmc.h"
+#include "libspu/mpc/utils/lowmc_utils.h"
 #include "libspu/mpc/utils/ring_ops.h"
 #include "libspu/mpc/utils/simulate.h"
 
@@ -76,7 +81,8 @@ std::unique_ptr<SPUContext> makeTTPSemi2kProtocol(
   ttp->set_adjust_rank(lctx->WorldSize() - 1);
   ttp->set_server_host(server_host);
   ttp->set_asym_crypto_schema("SM2");
-  ttp->set_server_public_key(key_pair.first.data(), key_pair.first.size());
+  ttp->set_server_public_key(key_pair.first.data<char>(),
+                             key_pair.first.size());
 
   return makeSemi2kProtocol(ttp_rt, lctx);
 }
@@ -554,9 +560,6 @@ TEST_P(BeaverCacheTest, ExpA) {
 
     bytes = lctx->GetStats()->sent_bytes - bytes;
     action = lctx->GetStats()->sent_actions - action;
-    SPDLOG_INFO("ExpA ({}) for n = {}, sent {} MiB ({} B per), actions {}",
-                field, numel, bytes * 1. / 1024. / 1024., bytes * 1. / numel,
-                action);
   });
   assert(outp[0].eltype() == ring2k_shr[0].eltype());
   auto got = ring_add(outp[0], outp[1]);
@@ -571,16 +574,112 @@ TEST_P(BeaverCacheTest, ExpA) {
       expected = static_cast<double>(std::round((expected * (1L << fxp)))) /
                  (1L << fxp);
       double got = static_cast<double>(got_view[i]) / (1L << fxp);
-      // cout left here for future improvement
-      std::cout << "expected: " << fmt::format("{0:f}", expected)
-                << ", got: " << fmt::format("{0:f}", got) << std::endl;
-      std::cout << "expected: "
-                << fmt::format("{0:b}",
-                               static_cast<ring2k_t>(expected * (1L << fxp)))
-                << ", got: " << fmt::format("{0:b}", got_view[i]) << std::endl;
       max_err = std::max(max_err, std::abs(expected - got));
     }
     ASSERT_LE(max_err, 1e-0);
   });
 }
+
+// (protocol factory, runtime config, LowMC runtime field, number of parties)
+using LowMCTestParams =
+    std::tuple<CreateObjectFn, RuntimeConfig, FieldType, size_t>;
+
+class LowMCTest : public ::testing::TestWithParam<LowMCTestParams> {};
+
+INSTANTIATE_TEST_SUITE_P(
+    Semi2k, LowMCTest,
+    testing::Combine(
+        testing::Values(CreateObjectFn(makeSemi2kProtocol, "tfp"),
+                        CreateObjectFn(makeTTPSemi2kProtocol,
+                                       "ttp")),         // TFP or TTP
+        testing::Values(makeConfig(FieldType::FM32),    // Global Field
+                        makeConfig(FieldType::FM64),    //
+                        makeConfig(FieldType::FM128)),  //
+        testing::Values(FM32, FM64, FM128),             // LowMC runtime Field
+        testing::Values(2)),                            // npc
+    [](const testing::TestParamInfo<LowMCTest::ParamType>& p) {
+      return fmt::format("{}x{}x{}x{}", std::get<0>(p.param).name(),
+                         std::get<1>(p.param).field(), std::get<2>(p.param),
+                         std::get<3>(p.param));
+    });
+
+TEST_P(LowMCTest, EncryptCorrect) {
+  const auto factory = std::get<0>(GetParam());
+  const RuntimeConfig& conf = std::get<1>(GetParam());
+
+  // The global field (in conf) can differ from the field LowMC runs on
+  const auto field = std::get<2>(GetParam());
+  const size_t npc = std::get<3>(GetParam());
+
+  const Shape shape = {10, 5};
+  // alternative, larger input: Shape{1000, 1000}
+
+  const auto bty = makeType<spu::mpc::semi2k::BShrTy>(field);
+  const auto numel = shape.numel();
+
+  // XOR-sharing of x: pub_x = x[0] ^ x[1]
+  NdArrayRef x[2];
+  x[0] = ring_rand(field, shape).as(bty);
+  x[1] = ring_rand(field, shape).as(bty);
+  auto pub_x = ring_xor(x[0], x[1]);
+
+  // XOR-sharing of the key: pub_key = key[0] ^ key[1]
+  uint128_t key[2];
+  key[0] = yacl::crypto::SecureRandSeed();
+  key[1] = yacl::crypto::SecureRandSeed();
+  auto pub_key = key[0] ^ key[1];
+
+  uint128_t seed = 0;  // same seed for both parties and the reference cipher
+
+  NdArrayRef out[2];  // indexed by rank; sized for the npc = 2 instantiation
+  utils::simulate(npc, [&](const std::shared_ptr<yacl::link::Context>& lcxt) {
+    auto obj = factory(conf, lcxt);
+    KernelEvalContext kcontext(obj.get());
+
+    int rank = lcxt->Rank();
+
+    // the kernel must be registered under its bind name
+    SPU_ENFORCE(obj->hasKernel("lowmc_b"));
+    spu::mpc::semi2k::LowMcB cipher;
+
+    size_t b0 = lcxt->GetStats()->sent_bytes;
+    size_t r0 = lcxt->GetStats()->sent_actions;
+    yacl::ElapsedTimer pack_timer;
+
+    // To test the correctness, we call the inner api with a known key share
+    out[rank] = cipher.encrypt(&kcontext, x[rank], key[rank], seed);
+
+    double pack_time = pack_timer.CountMs() * 1.0;
+    size_t b1 = lcxt->GetStats()->sent_bytes;
+    size_t r1 = lcxt->GetStats()->sent_actions;
+
+    SPDLOG_INFO(
+        "LowMC ({}) for n = {}, elapsed {} ms, sent {} MiB ({} B per), "
+        "actions {}.",
+        field, numel, pack_time, (b1 - b0) * 1. / 1024. / 1024.,
+        (b1 - b0) * 1. / numel, r1 - r0);
+  });
+
+  SPU_ENFORCE(out[0].eltype().isa<semi2k::BShrTy>());
+  SPU_ENFORCE(out[1].eltype().isa<semi2k::BShrTy>());
+
+  auto got = ring_xor(out[0], out[1]);  // reconstruct the MPC ciphertext
+  DISPATCH_ALL_FIELDS(field, [&]() {  //
+    NdArrayView<ring2k_t> _got(got);
+
+    auto block_cipher = LowMC(field, seed, get_data_complexity(numel));
+    block_cipher.set_key(pub_key);
+
+    auto c = block_cipher.encrypt(pub_x);  // plaintext reference ciphertext
+    NdArrayView<ring2k_t> _exp(c);
+
+    for (int64_t i = 0; i < numel; ++i) {
+      auto got_val = _got[i];
+      auto exp_val = _exp[i];
+
+      EXPECT_EQ(got_val, exp_val);
+    }
+  });
+}
+
 }  // namespace spu::mpc::test
diff --git a/libspu/mpc/semi2k/state.h b/libspu/mpc/semi2k/state.h
index f9b394ca5..2e88c9f69 100644
--- a/libspu/mpc/semi2k/state.h
+++ b/libspu/mpc/semi2k/state.h
@@ -50,7 +50,22 @@ class Semi2kState : public State {
         const auto& key = conf.ttp_beaver_config().server_public_key();
         ops.server_public_key = yacl::Buffer(key.data(), key.size());
       }
-      // TODO: TLS & brpc options.
+      if (!conf.ttp_beaver_config().transport_protocol().empty()) {
+        ops.brpc_channel_protocol =
+            conf.ttp_beaver_config().transport_protocol();
+      }
+      if (conf.ttp_beaver_config().has_ssl_config()) {
+        brpc::ChannelSSLOptions ssl_options;
+        ssl_options.verify.ca_file_path =
+            conf.ttp_beaver_config().ssl_config().ca_file_path();
+        ssl_options.verify.verify_depth =
+            conf.ttp_beaver_config().ssl_config().verify_depth();
+        ssl_options.client_cert.certificate =
+            conf.ttp_beaver_config().ssl_config().certificate();
+        ssl_options.client_cert.private_key =
+            conf.ttp_beaver_config().ssl_config().private_key();
+        ops.brpc_ssl_options = std::move(ssl_options);
+      }
       beaver_ = std::make_unique<semi2k::BeaverTtp>(lctx, std::move(ops));
     } else {
       SPU_THROW("unsupported beaver type {}", conf.beaver_type());
diff --git a/libspu/mpc/spdz2k/BUILD.bazel b/libspu/mpc/spdz2k/BUILD.bazel
index fb1ab48ae..855ece0ad 100644
--- a/libspu/mpc/spdz2k/BUILD.bazel
+++ b/libspu/mpc/spdz2k/BUILD.bazel
@@ -197,7 +197,7 @@ spu_cc_library(
         "//libspu/mpc/common:communicator",
         "//libspu/mpc/common:pv2k",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
     alwayslink = True,
 )
diff --git a/libspu/mpc/spdz2k/beaver/BUILD.bazel b/libspu/mpc/spdz2k/beaver/BUILD.bazel
index b7f4d80e7..9846a4709 100644
--- a/libspu/mpc/spdz2k/beaver/BUILD.bazel
+++ b/libspu/mpc/spdz2k/beaver/BUILD.bazel
@@ -35,7 +35,7 @@ spu_cc_library(
         "//libspu/mpc/common:prg_tensor",
         "//libspu/mpc/spdz2k:commitment",
         "//libspu/mpc/utils:ring_ops",
-        "@com_github_microsoft_seal//:seal",
+        "@seal",
         "@yacl//yacl/crypto/block_cipher:symmetric_crypto",
         "@yacl//yacl/crypto/tools:prg",
         "@yacl//yacl/link",
@@ -51,7 +51,7 @@ spu_cc_test(
         ":beaver_tfp",
         ":beaver_tinyot",
         "//libspu/mpc/utils:simulate",
-        "@com_google_googletest//:gtest",
+        "@googletest//:gtest",
     ],
 )
 
diff --git a/libspu/mpc/spdz2k/ot/BUILD.bazel b/libspu/mpc/spdz2k/ot/BUILD.bazel
index 124adaf10..937eb3c18 100644
--- a/libspu/mpc/spdz2k/ot/BUILD.bazel
+++ b/libspu/mpc/spdz2k/ot/BUILD.bazel
@@ -42,7 +42,7 @@ spu_cc_library(
     copts = AES_COPT_FLAGS + ["-Wno-ignored-attributes"],
     deps = [
         "//libspu/core:prelude",
-        "@com_github_emptoolkit_emp_tool//:emp-tool",
+        "@emp-tool//:emp-tool",
         "@yacl//yacl/crypto/hash:hash_interface",
         "@yacl//yacl/crypto/hash:hash_utils",
         "@yacl//yacl/crypto/tools:crhash",
@@ -66,7 +66,7 @@ spu_cc_library(
         "//libspu/mpc/spdz2k:commitment",
         "//libspu/mpc/spdz2k/ot:kos_ote",
         "//libspu/mpc/utils:ring_ops",
-        "@com_github_emptoolkit_emp_tool//:emp-tool",
+        "@emp-tool//:emp-tool",
         "@yacl//yacl/crypto/tools:prg",
         "@yacl//yacl/kernel/type:ot_store",
         "@yacl//yacl/link",
diff --git a/libspu/mpc/tools/BUILD.bazel b/libspu/mpc/tools/BUILD.bazel
index 2d2892373..5a430b2e2 100644
--- a/libspu/mpc/tools/BUILD.bazel
+++ b/libspu/mpc/tools/BUILD.bazel
@@ -58,9 +58,9 @@ spu_cc_binary(
         "//libspu/mpc/common:communicator",
         "//libspu/mpc/semi2k",
         "//libspu/mpc/utils:simulate",
-        "@com_github_fmtlib_fmt//:fmtlib",
-        "@com_github_google_benchmark//:benchmark",
-        "@com_google_absl//absl/strings",
+        "@abseil-cpp//absl/strings",
+        "@fmt",
+        "@google_benchmark//:benchmark",
         "@llvm-project//llvm:Support",
         "@yacl//yacl/link:context",
     ],
diff --git a/libspu/mpc/utils/BUILD.bazel b/libspu/mpc/utils/BUILD.bazel
index 2b494ff1d..3ea28e9b3 100644
--- a/libspu/mpc/utils/BUILD.bazel
+++ b/libspu/mpc/utils/BUILD.bazel
@@ -126,7 +126,7 @@ spu_cc_binary(
     srcs = ["ring_ops_bench.cc"],
     deps = [
         ":ring_ops",
-        "@com_github_google_benchmark//:benchmark",
+        "@google_benchmark//:benchmark",
     ],
 )
 
@@ -138,7 +138,7 @@ spu_cc_library(
     linkopts = OMP_LINKFLAGS,
     deps = [
         "//libspu/core:parallel_utils",
-        "@eigen_archive//:eigen3",
+        "@eigen",
     ] + OMP_DEPS,
 )
 
@@ -158,3 +158,36 @@ spu_cc_library(
         "//libspu/core:parallel_utils",
     ],
 )
+
+spu_cc_library(
+    name = "lowmc",
+    srcs = ["lowmc.cc"],
+    hdrs = ["lowmc.h"],
+    deps = [
+        ":lowmc_utils",
+        ":ring_ops",
+        "//libspu/core:ndarray_ref",
+        "@yacl//yacl/crypto/tools:prg",
+    ],
+)
+
+spu_cc_library(
+    name = "lowmc_utils",
+    srcs = ["lowmc_utils.cc"],
+    hdrs = ["lowmc_utils.h"],
+    deps = [
+        ":ring_ops",
+        "//libspu/core:ndarray_ref",
+        "//libspu/core:prelude",
+    ],
+)
+
+spu_cc_test(
+    name = "lowmc_test",
+    srcs = ["lowmc_test.cc"],
+    deps = [
+        ":lowmc",
+        ":ring_ops",
+        "@yacl//yacl/utils:elapsed_timer",
+    ],
+)
diff --git a/libspu/mpc/utils/lowmc.cc b/libspu/mpc/utils/lowmc.cc
new file mode 100644
index 000000000..a4b37e500
--- /dev/null
+++ b/libspu/mpc/utils/lowmc.cc
@@ -0,0 +1,372 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/mpc/utils/lowmc.h"
+
+#include "libspu/mpc/utils/lowmc_utils.h"
+#include "libspu/mpc/utils/ring_ops.h"
+
+namespace spu::mpc {
+
+namespace {
+
+template <typename T>
+bool get_bit(const T x, int i) {
+  return ((x >> i) & 1) != 0;  // true iff bit `i` (LSB is bit 0) of x is set
+}
+
+// Some linear algebra helper functions
+uint64_t rank_of_matrix(const NdArrayRef& matrix) {
+  SPU_ENFORCE(matrix.shape().size() == 1, "matrix should be a 1-D array");
+
+  const auto n_rows = static_cast<uint64_t>(matrix.numel());
+  auto mat = matrix.clone();  // eliminate on a private copy
+  const auto field = mat.eltype().as<RingTy>()->field();
+
+  // Rank over GF(2): Gaussian elimination, counting the pivot rows found.
+  uint64_t row = 0;
+
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    using block_type = ring2k_t;
+    NdArrayView<block_type> _mat(mat);
+
+    // Column count (`block_size_` or `key_size_`). Spelled as uint64_t so that
+    // std::min sees two identical types even where size_t != uint64_t.
+    const uint64_t size = sizeof(block_type) * 8;
+    const auto max_rank = std::min(n_rows, size);
+
+    // Sweep columns from the most significant bit down; the `row < max_rank`
+    // guard stops at full rank and keeps an empty matrix from being indexed.
+    for (uint64_t col = 1; col <= size && row < max_rank; ++col) {
+      // if the pivot is zero, then find the first non-zero row and swap it
+      if (!get_bit(_mat[row], size - col)) {
+        uint64_t r = row;
+        while (r < n_rows && !get_bit(_mat[r], size - col)) {
+          ++r;
+        }
+        // all rows in this column are zero, skip it
+        if (r >= n_rows) {
+          continue;
+        } else {
+          auto temp = _mat[row];
+          _mat[row] = _mat[r];
+          _mat[r] = temp;
+        }
+      }
+      // clear this column in every row below the pivot
+      for (uint64_t i = row + 1; i < n_rows; ++i) {
+        if (get_bit(_mat[i], size - col)) {
+          _mat[i] ^= _mat[row];
+        }
+      }
+      ++row;
+    }
+  });
+
+  return row;
+}
+
+// Inverts a square GF(2) matrix by Gauss-Jordan: [M | I] -> [I | inv(M)].
+// The rank is NOT checked; for a singular input the result is not an inverse.
+NdArrayRef invert_matrix(const NdArrayRef& matrix) {
+  SPU_ENFORCE(matrix.shape().size() == 1, "matrix should be a 1-D array");
+
+  const auto n_rows = static_cast<uint64_t>(matrix.numel());
+  auto mat = matrix.clone();  // reduced in place; the argument is untouched
+  const auto field = mat.eltype().as<RingTy>()->field();
+
+  auto inv_mat = NdArrayRef(matrix.eltype(), matrix.shape());
+
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    using block_type = ring2k_t;
+    const auto size = sizeof(block_type) * 8;  // bits per element = columns
+    SPU_ENFORCE(n_rows == size, "Not a square matrix.");
+
+    NdArrayView<block_type> _mat(mat);
+    NdArrayView<block_type> _inv_mat(inv_mat);
+
+    // init inv_mat as identity matrix: row idx has only bit idx set
+    pforeach(0, n_rows, [&](int64_t idx) {  //
+      _inv_mat[idx] = (static_cast<block_type>(1) << idx);
+    });
+
+    // Forward pass to triangular form; each row op is mirrored on inv_mat
+    uint64_t row = 0;
+    for (uint64_t col = 0; col < size; ++col) {
+      // if the pivot is zero, then find the first non-zero row and swap it
+      if (!get_bit(_mat[row], col)) {
+        uint64_t r = row + 1;
+        while (r < n_rows && !get_bit(_mat[r], col)) {
+          ++r;
+        }
+        if (r >= n_rows) {  // no pivot in this column: input is singular
+          continue;
+        } else {
+          auto temp = _mat[row];
+          _mat[row] = _mat[r];
+          _mat[r] = temp;
+
+          temp = _inv_mat[row];
+          _inv_mat[row] = _inv_mat[r];
+          _inv_mat[r] = temp;
+        }
+      }
+      for (uint64_t i = row + 1; i < n_rows; ++i) {
+        if (get_bit(_mat[i], col)) {
+          _mat[i] ^= _mat[row];
+          _inv_mat[i] ^= _inv_mat[row];
+        }
+      }
+      ++row;
+    }
+    // Back substitution to the identity. Row col-1 is taken as the pivot of
+    // column col-1, which holds only if no column was skipped above.
+    for (uint64_t col = size; col > 0; --col) {
+      for (uint64_t r = 0; r < col - 1; ++r) {
+        if (get_bit(_mat[r], col - 1)) {
+          _mat[r] ^= _mat[col - 1];
+          _inv_mat[r] ^= _inv_mat[col - 1];
+        }
+      }
+    }
+  });
+
+  return inv_mat;
+}
+
+}  // namespace
+
+/// public api implementation
+
+LowMC::LowMC(FieldType field, uint128_t seed, int64_t d, uint64_t key_size,
+             bool need_decrypt) {
+  SPU_ENFORCE(key_size == 128, "key size should always be 128 now");
+  // Pick (#S-boxes, #rounds) from the block size (field) and complexity d.
+  int64_t n_boxes;
+  int64_t rounds;
+  if (field == FM32) {
+    SPU_ENFORCE(d < 32,  // NOTE(review): text says 2^32, but d == 32 fails
+                "Support at most 2^32 blocks to encrypt for 32-bit blocks.");
+    // every accepted d (e.g. 20 or 30) uses the same parameter setting.
+    n_boxes = 9;
+    rounds = 15;
+  } else if (field == FM64) {
+    switch (d) {
+      case 20:
+        n_boxes = 15;
+        rounds = 11;
+        break;
+      case 30:
+        n_boxes = 13;
+        rounds = 12;
+        break;
+      case 40:
+        n_boxes = 13;
+        rounds = 13;
+        break;
+      default:
+        SPU_THROW("Not supported data complexity.");
+    }
+  } else if (field == FM128) {
+    switch (d) {
+      case 20:
+        n_boxes = 25;
+        rounds = 10;
+        break;
+      case 30:
+        n_boxes = 25;
+        rounds = 11;
+        break;
+      case 40:
+        n_boxes = 25;
+        rounds = 12;
+        break;
+      default:
+        SPU_THROW("Not supported data complexity.");
+    }
+  } else {
+    SPU_THROW("Should not be here.");
+  }
+
+  field_ = field;
+  seed_ = seed;
+  number_of_boxes_ = n_boxes;
+  rounds_ = rounds;
+  key_size_ = key_size;
+  need_decrypt_ = need_decrypt;
+  block_size_ = SizeOf(field) * 8;
+  SPU_ENFORCE(block_size_ <= 128,
+              "data size should be no more than 128 bits now.");
+
+  // each S-box covers kSboxBits (3) bits; the rest is the identity part
+  identity_size_ = block_size_ - number_of_boxes_ * kSboxBits;
+
+  // fill the key-independent random matrices and round constants
+  fill_matrixes(need_decrypt);
+}
+
+void LowMC::set_key(KeyType key) {
+  // The first key wins: once round keys exist, later calls are no-ops.
+  if (!key_been_set_) {
+    // expand the master key into rounds_ + 1 round keys
+    round_keys_ = generate_round_keys(key_matrices_, key, rounds_, field_);
+    key_been_set_ = true;
+  }
+}
+
+NdArrayRef LowMC::encrypt(const NdArrayRef& plaintext) {
+  SPU_ENFORCE(key_been_set_, "key not set.");
+  SPU_ENFORCE(plaintext.eltype().as<RingTy>()->field() == field_,
+              "field mismatch");
+  const auto& shape = plaintext.shape();
+
+  // Whitening: round key 0 is xored in before the first round.
+  auto state = ring_xor(plaintext, round_keys_[0].broadcast_to(shape, {}));
+
+  // Round i (0-based) consumes matrix i, constant i and round key i + 1.
+  for (uint64_t i = 0; i < rounds_; ++i) {
+    // non-linear layer: table-driven S-boxes
+    state = Substitution(state, kSBox);
+
+    // affine layer: GF(2) matrix product, then the round constant
+    state = dot_product_gf2(lin_matrices_[i], state, field_);
+    ring_xor_(state, round_constants_[i].broadcast_to(shape, {}));
+
+    // key addition
+    ring_xor_(state, round_keys_[i + 1].broadcast_to(shape, {}));
+  }
+
+  return state;
+}
+
+NdArrayRef LowMC::decrypt(const NdArrayRef& ciphertext) {
+  SPU_ENFORCE(key_been_set_, "key not set.");
+  SPU_ENFORCE(ciphertext.eltype().as<RingTy>()->field() == field_,
+              "field mismatch");
+  // inv_lin_matrices_ is only filled when need_decrypt was requested
+  SPU_ENFORCE(need_decrypt_, "cipher was built without decrypt support.");
+  const auto& shape = ciphertext.shape();
+
+  // Inverse of encrypt, on a deep copy: the in-place xors must not alias input
+  auto c = ciphertext.clone();
+  for (uint64_t r = rounds_; r > 0; r--) {
+    ring_xor_(c, round_keys_[r].broadcast_to(shape, {}));
+    ring_xor_(c, round_constants_[r - 1].broadcast_to(shape, {}));
+    c = dot_product_gf2(inv_lin_matrices_[r - 1], c, field_);
+    c = Substitution(c, kInvSBox);
+  }
+  // finally strip the initial whitening key
+  ring_xor_(c, round_keys_[0].broadcast_to(shape, {}));
+  return c;
+}
+
+/// private api implementation
+
+NdArrayRef LowMC::Substitution(const NdArrayRef& data,
+                               absl::Span<uint64_t const> sbox) const {
+  NdArrayRef ret(data.eltype(), data.shape());
+  // S-box layer, element-wise; `sbox` is indexed by a 3-bit value (0..7).
+  DISPATCH_ALL_FIELDS(ret.eltype().as<RingTy>()->field(), [&]() {
+    using block_type = ring2k_t;
+    NdArrayView<block_type> _data(data);
+    NdArrayView<block_type> _ret(ret);
+
+    pforeach(0, data.numel(), [&](int64_t idx) {
+      block_type tmp = 0;
+
+      // Identity part: the bits above the 3 * number_of_boxes_ S-box bits
+      tmp ^= (_data[idx] >> (3 * number_of_boxes_));
+
+      // S-box part: substitute each 3-bit group, highest group first
+      for (uint64_t i = 1; i <= number_of_boxes_; ++i) {
+        tmp <<= 3;  // make room for the next substituted group
+        auto ind = ((_data[idx] >> 3 * (number_of_boxes_ - i)) & 0x7);
+        tmp ^= static_cast<block_type>(sbox[ind]);
+      }
+
+      _ret[idx] = tmp;
+    });
+  });
+
+  return ret;
+}
+
+void LowMC::fill_matrixes(bool need_decrypt) {
+  // 1. create the linear-layer matrices (one per round)
+  lin_matrices_.reserve(rounds_);
+  // -1 means no rank checking
+  int64_t desire_rank = -1;
+  if (need_decrypt) {
+    inv_lin_matrices_.reserve(rounds_);
+    // Note: we force block_size_ <= key_size_ = 128, so we can just use the
+    // same rank for all linear matrices and key matrices.
+    desire_rank = block_size_;
+  }
+  // All draws below go through replay_ring_rand, which advances cnt_.
+  for (uint64_t i = 0; i < rounds_; i++) {
+    auto mat = get_pub_rand_blocks(field_, block_size_, desire_rank);
+    lin_matrices_.push_back(mat);
+
+    if (need_decrypt) {
+      inv_lin_matrices_.push_back(invert_matrix(mat));
+    }
+  }
+
+  // 2. create round constants (a single block per round)
+  round_constants_.reserve(rounds_);
+  for (uint64_t i = 0; i < rounds_; i++) {
+    round_constants_.push_back(get_pub_rand_blocks(field_, 1));
+  }
+
+  // 3. create key matrices
+  key_matrices_.reserve(rounds_ + 1);  // first element is for initial whiten
+  for (uint64_t i = 0; i < rounds_ + 1; i++) {
+    // rows are FM128 (128-bit) because key_size is forced to 128 for safety.
+    key_matrices_.push_back(
+        get_pub_rand_blocks(FM128, block_size_, desire_rank));
+  }
+}
+
+NdArrayRef LowMC::replay_ring_rand(FieldType field, const Shape& shape) {
+  NdArrayRef res(makeType<RingTy>(field), shape);
+  // Fill the whole buffer from (seed_, iv_, cnt_) and keep FillPRand's result
+  cnt_ = yacl::crypto::FillPRand(
+      kCryptoType, seed_, iv_, cnt_,
+      absl::MakeSpan(res.data<char>(), res.buf()->size()));
+  // in cnt_ for the next call (presumably the advanced counter -- confirm).
+  return res;
+}
+
+NdArrayRef LowMC::get_pub_rand_blocks(FieldType field, int64_t n_blocks,
+                                      int64_t desire_rank) {
+  const auto ring_ty = makeType<RingTy>(field);
+  auto rand = replay_ring_rand(field, {n_blocks});
+
+  // desire_rank <= 0 disables the check; otherwise redraw until the rank fits
+  if (desire_rank > 0) {
+    // Rejection sampling. For a uniformly random n x n matrix M over GF(2),
+    // P(det(M) != 0) = (1-1/2) * (1-1/4) * ... * (1-1/2^n), which tends to
+    // ~0.2888 as n -> inf, so the expected number of draws is below 4.
+    // Each draw goes through replay_ring_rand, so a retry consumes further
+    // PRG output.
+    // NOTE(review): there is no retry cap; the loop relies on that bound.
+    while (rank_of_matrix(rand) != static_cast<uint64_t>(desire_rank)) {
+      rand = replay_ring_rand(field, {n_blocks});
+    }
+  }
+
+  return rand.as(ring_ty);
+}
+
+}  // namespace spu::mpc
diff --git a/libspu/mpc/utils/lowmc.h b/libspu/mpc/utils/lowmc.h
new file mode 100644
index 000000000..dfd920bf5
--- /dev/null
+++ b/libspu/mpc/utils/lowmc.h
@@ -0,0 +1,132 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "yacl/crypto/tools/prg.h"
+
+#include "libspu/core/ndarray_ref.h"
+
+namespace spu::mpc {
+
+// ref: Ciphers for MPC and FHE
+// https://eprint.iacr.org/2016/687.pdf
+class LowMC {
+  using KeyType = uint128_t;  // key size should always be 128, at least >= 80
+
+ public:
+  // To keep users from choosing wrong parameters, the inner parameters of
+  // LowMC are decided internally. These parameters are deduced from the 5
+  // attacks in the original LowMC paper.
+  // Note: decrypt is debug/test only and requires `need_decrypt = true`.
+  //
+  // d: data complexity, the log2 of the numbers of data_blocks
+  explicit LowMC(FieldType field, uint128_t seed, int64_t d,
+                 uint64_t key_size = 128, bool need_decrypt = false);
+
+  // plaintext set key procedure, debug only
+  void set_key(KeyType key);
+
+  ///
+  /// encrypt/decrypt api for plaintext data, debug only now
+  ///
+
+  NdArrayRef encrypt(const NdArrayRef& plaintext);
+
+  NdArrayRef decrypt(const NdArrayRef& ciphertext);
+
+  std::vector<NdArrayRef> Lmat() const { return lin_matrices_; }
+
+  std::vector<NdArrayRef> RoundConstants() const { return round_constants_; }
+
+  std::vector<NdArrayRef> Kmat() const { return key_matrices_; }
+
+  int64_t rounds() const { return rounds_; }
+
+  int64_t number_of_boxes() const { return number_of_boxes_; }
+
+  int64_t data_block_size() const { return block_size_; }
+
+ private:
+  // utils functions
+
+  // S-boxes implementation with lookup table
+  NdArrayRef Substitution(const NdArrayRef& data,
+                          absl::Span<uint64_t const> sbox) const;
+
+  // key filling functions
+  void fill_matrixes(bool need_decrypt);
+
+  // random blocks helper functions
+  // generate public and replay rand array.
+  NdArrayRef replay_ring_rand(FieldType field, const Shape& shape);
+
+  // Note: To save memory, we pack k bits into a single uint64_t or
+  // uint128_t number. So an n*k binary matrix is stored as a shape (n,)
+  // NdArrayRef, where each element (k bits) is one row of the matrix.
+  NdArrayRef get_pub_rand_blocks(FieldType field, int64_t n_blocks,
+                                 int64_t desire_rank = -1);
+
+  // some meta infos of the lowmc
+  static constexpr int kSboxBits = 3;
+  uint64_t block_size_;       // Data size in bits
+  FieldType field_;           // field of data block
+  uint64_t number_of_boxes_;  // Number of S-boxes in each round
+  uint64_t identity_size_;    // Size of the identity part in the Sbox layer
+  uint64_t key_size_;         // Key size in bits
+  uint64_t rounds_;
+  bool need_decrypt_;
+  bool key_been_set_ = false;
+
+  // random values related
+  uint128_t seed_;  // seed to generate random matrixes and keys
+  static constexpr yacl::crypto::SymmetricCrypto::CryptoType kCryptoType =
+      yacl::crypto::SymmetricCrypto::CryptoType::AES128_ECB;
+  uint128_t iv_ = 0;
+  uint64_t cnt_ = 0;
+
+  // inner matrixes and keys
+  // Stores the binary matrices for each round.
+  // each array, shape: (block_size_,)
+  // each element is a ROW of matrix, i.e. block_size_ bits
+  std::vector<NdArrayRef> lin_matrices_;
+  // Stores the round constants
+  // each array, shape: (1,)
+  // each element is block_size_ bits
+  std::vector<NdArrayRef> round_constants_;
+  // Stores the matrices that generate the round keys
+  // each array, shape: (block_size_,)
+  // each element is a ROW of matrix, i.e. key_size_ bits
+  std::vector<NdArrayRef> key_matrices_;
+  // Stores the round keys
+  // each array, shape: (1,)
+  // each element is block_size_ bits
+  std::vector<NdArrayRef> round_keys_;
+
+  // some matrixes for decrypt, valid only for testing
+  // Stores the inverses of LinMatrices
+  // each array, shape: (block_size_,)
+  // each element is a ROW of matrix, i.e. block_size_ bits
+  std::vector<NdArrayRef> inv_lin_matrices_;
+
+  // The Sbox and its inverse
+  // The plaintext implementations of the Sbox and its inverse are based on
+  // Look-Up tables.
+  static constexpr std::array<uint64_t, 8> kSBox = {0x00, 0x01, 0x03, 0x06,
+                                                    0x07, 0x04, 0x05, 0x02};
+  static constexpr std::array<uint64_t, 8> kInvSBox = {0x00, 0x01, 0x07, 0x02,
+                                                       0x05, 0x06, 0x03, 0x04};
+};
+
+}  // namespace spu::mpc
diff --git a/libspu/mpc/utils/lowmc_test.cc b/libspu/mpc/utils/lowmc_test.cc
new file mode 100644
index 000000000..ae1bdbadb
--- /dev/null
+++ b/libspu/mpc/utils/lowmc_test.cc
@@ -0,0 +1,76 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/mpc/utils/lowmc.h"
+
+#include "gtest/gtest.h"
+#include "yacl/utils/elapsed_timer.h"
+
+#include "libspu/mpc/utils/ring_ops.h"
+
+namespace spu::mpc {
+
+TEST(LowMC, List) {
+  uint128_t seed = 107;
+  uint128_t key = 11;
+  int64_t d = 20;  // data complexity
+  int64_t n = 100;
+  const Shape shape = {n, n};
+  // Round trip: decrypt(encrypt(x)) must give x back for each block size.
+  // 64-bits block
+  {
+    FieldType field = FM64;
+
+    yacl::ElapsedTimer pack_timer;  // times the constructor only
+    auto cipher = LowMC(field, seed, d, 128, true);  // need_decrypt = true
+    double init_time = pack_timer.CountMs();
+
+    cipher.set_key(key);
+
+    auto values = ring_rand(field, shape);
+
+    auto c = cipher.encrypt(values);
+
+    auto p = cipher.decrypt(c);
+
+    SPDLOG_INFO("{} blocks, {}-bits block, fill random {} ms", shape.numel(),
+                64, init_time);
+
+    EXPECT_TRUE(ring_all_equal(values, p));
+  }
+
+  // 128-bits block
+  {
+    FieldType field = FM128;
+
+    yacl::ElapsedTimer pack_timer;  // times the constructor only
+    auto cipher = LowMC(field, seed, d, 128, true);  // need_decrypt = true
+    double init_time = pack_timer.CountMs();
+
+    cipher.set_key(key);
+
+    auto values = ring_rand(field, shape);
+
+    auto c = cipher.encrypt(values);
+
+    auto p = cipher.decrypt(c);
+
+    SPDLOG_INFO("{} blocks, {}-bits block, fill random {} ms", shape.numel(),
+                128, init_time);
+
+    EXPECT_TRUE(ring_all_equal(values, p));
+  }
+}
+
+}  // namespace spu::mpc
diff --git a/libspu/mpc/utils/lowmc_utils.cc b/libspu/mpc/utils/lowmc_utils.cc
new file mode 100644
index 000000000..2354cf03d
--- /dev/null
+++ b/libspu/mpc/utils/lowmc_utils.cc
@@ -0,0 +1,117 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libspu/mpc/utils/lowmc_utils.h"
+
+#include "libspu/core/prelude.h"
+#include "libspu/mpc/utils/ring_ops.h"
+
+namespace spu::mpc {
+
+namespace {
+
+template <typename T>
+constexpr T bit_parity(const T x) {
+  static_assert(std::is_unsigned_v<T>);
+
+  // Xor-fold the upper half of the word onto the lower half, halving the
+  // width each step; bit 0 then holds the xor of all bits of x.
+  T folded = x;
+  for (auto width = sizeof(T) * 8; width > 1; width /= 2) {
+    folded ^= (folded >> (width / 2));
+  }
+
+  return folded & 1;
+}
+
+}  // namespace
+
+NdArrayRef dot_product_gf2(const NdArrayRef& x, const NdArrayRef& y,
+                           FieldType to_field) {
+  // Conceptually, x is an n*k binary matrix (one row per element) and y is an
+  // m*k binary matrix (y may have any shape; 2-d is just the example);
+  // ret holds m n-bit values: bit i of ret[j] = parity(x[i] & y[j]).
+  // IMPORTANT: the field of (x,y) and ret may be different!
+  SPU_ENFORCE(x.elsize() == y.elsize(), "size mismatch");
+  SPU_ENFORCE(x.shape().size() == 1,
+              "x should be a 1-D array, i.e. n*k binary matrix.");
+
+  const auto field = x.eltype().as<RingTy>()->field();
+  const auto n = x.shape().dim(0);
+  SPU_ENFORCE(SizeOf(to_field) * 8 == (uint64_t)n,
+              "mismatch of output bit size and type.");
+
+  auto out = ring_zeros(to_field, y.shape());  // bits are or-ed in below
+
+  DISPATCH_ALL_FIELDS(field, [&]() {
+    using src_type = ring2k_t;  // element type of x (and of the products)
+
+    DISPATCH_ALL_FIELDS(to_field, [&]() {
+      using to_type = ring2k_t;  // element type of the result
+
+      NdArrayView<to_type> _out(out);
+
+      Index ind(1, 0);  // 1-d index, reused to select row i of x
+      for (int64_t i = 0; i < n; ++i) {
+        ind[0] = i;
+        const auto row = x.slice_scalar_at(ind).broadcast_to(y.shape(), {});
+        auto prod = ring_and(y, row);
+        NdArrayView<src_type> _prod(prod);
+
+        pforeach(0, out.numel(), [&](int64_t idx) {  //
+          _out[idx] =
+              _out[idx] | (static_cast<to_type>(bit_parity(_prod[idx])) << i);
+        });
+      }
+    });
+  });
+
+  return out;
+}
+
+std::vector<NdArrayRef> generate_round_keys(
+    const std::vector<NdArrayRef>& key_matrices, uint128_t key, uint64_t rounds,
+    FieldType to_field) {
+  SPU_ENFORCE(key_matrices.size() == (rounds + 1), "key matrix size mismatch");
+
+  NdArrayRef master_key(makeType<RingTy>(FM128), {1});
+  NdArrayView<uint128_t> key_view(master_key);
+  key_view[0] = key;
+
+  std::vector<NdArrayRef> round_keys;
+  round_keys.reserve(key_matrices.size());
+  // One round key per key matrix (rounds + 1 in total, enforced above); the
+  // first one is for the initial whitening.
+  for (const auto& key_matrix : key_matrices) {
+    round_keys.push_back(dot_product_gf2(key_matrix, master_key, to_field));
+  }
+
+  return round_keys;
+}
+
+int64_t get_data_complexity(int64_t n) {
+  const auto n_bits = Log2Ceil(n);
+  // Map Log2Ceil(n) to the smallest supported level: 20, 30 or 40.
+  if (n_bits <= 20) {
+    return 20;
+  } else if (n_bits <= 30) {
+    return 30;
+  } else if (n_bits <= 40) {
+    return 40;
+  }
+  // NOTE(review): behaviour for n <= 0 depends on Log2Ceil -- confirm.
+  SPU_THROW("Support at most 2^40 now.");
+}
+
+}  // namespace spu::mpc
diff --git a/libspu/mpc/utils/lowmc_utils.h b/libspu/mpc/utils/lowmc_utils.h
new file mode 100644
index 000000000..c0bd71c21
--- /dev/null
+++ b/libspu/mpc/utils/lowmc_utils.h
@@ -0,0 +1,43 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "libspu/core/ndarray_ref.h"
+
+namespace spu::mpc {
+
+// recap: For n*k binary matrix, we regard it as a (n,) NdArrayRef, each
+// element is a row (k bits).
+// For n*k binary matrix A, k-bits binary vector B, n-bits C = dot(A, B):
+// C[r] = bit_parity(A[r] & B) for r in [0, n)
+NdArrayRef dot_product_gf2(const NdArrayRef& x, const NdArrayRef& y,
+                           FieldType to_field);
+
+// Key is strongly dependent on the sharing semantics, so we leave the key
+// setting procedure in kernel layer.
+// Here we implement the plaintext scheme, which can also be used in n-n
+// xor sharing semantics (e.g. SEMI2K, CHEETAH, etc.).
+// For ABY3, call this function twice to obtain the two shares of the round
+// keys.
+std::vector<NdArrayRef> generate_round_keys(
+    const std::vector<NdArrayRef>& key_matrices, uint128_t key, uint64_t rounds,
+    FieldType to_field);
+
+// we only support three choices for data complexity now.
+// n <= 2^20 (about 1 million); n <= 2^30 (about 1 billion); n <= 2^40
+// (about 1 trillion)
+int64_t get_data_complexity(int64_t n);
+
+}  // namespace spu::mpc
diff --git a/libspu/mpc/utils/permute.cc b/libspu/mpc/utils/permute.cc
index 62a8e85a3..800fa1dfa 100644
--- a/libspu/mpc/utils/permute.cc
+++ b/libspu/mpc/utils/permute.cc
@@ -20,6 +20,7 @@
 #include "yacl/crypto/rand/rand.h"
 
 #include "libspu/core/ndarray_ref.h"
+#include "libspu/core/parallel_utils.h"
 #include "libspu/core/type_util.h"
 
 namespace spu::mpc {
@@ -44,9 +45,7 @@ NdArrayRef applyInvPerm(const NdArrayRef& x, absl::Span<const int64_t> pv) {
   DISPATCH_ALL_FIELDS(field, [&]() {
     NdArrayView<ring2k_t> _x(x);
     NdArrayView<ring2k_t> _y(y);
-    for (int64_t i = 0; i < y.numel(); i++) {
-      _y[pv[i]] = _x[i];
-    }
+    pforeach(0, y.numel(), [&](int64_t i) { _y[pv[i]] = _x[i]; });
   });
   return y;
 }
@@ -63,9 +62,7 @@ NdArrayRef applyInvPerm(const NdArrayRef& x, const NdArrayRef& pv) {
     const auto pv_field = pv.eltype().as<Ring2k>()->field();
     DISPATCH_ALL_FIELDS(pv_field, [&]() {
       NdArrayView<ring2k_t> _pv(pv);
-      for (int64_t i = 0; i < y.numel(); i++) {
-        _y[_pv[i]] = _x[i];
-      }
+      pforeach(0, y.numel(), [&](int64_t i) { _y[_pv[i]] = _x[i]; });
     });
   });
   return y;
@@ -79,9 +76,7 @@ NdArrayRef applyPerm(const NdArrayRef& x, absl::Span<const int64_t> pv) {
   DISPATCH_ALL_FIELDS(field, [&]() {
     NdArrayView<ring2k_t> _x(x);
     NdArrayView<ring2k_t> _y(y);
-    for (int64_t i = 0; i < y.numel(); i++) {
-      _y[i] = _x[pv[i]];
-    }
+    pforeach(0, y.numel(), [&](int64_t i) { _y[i] = _x[pv[i]]; });
   });
   return y;
 }
@@ -98,9 +93,7 @@ NdArrayRef applyPerm(const NdArrayRef& x, const NdArrayRef& pv) {
     const auto pv_field = pv.eltype().as<Ring2k>()->field();
     DISPATCH_ALL_FIELDS(pv_field, [&]() {
       NdArrayView<ring2k_t> _pv(pv);
-      for (int64_t i = 0; i < y.numel(); i++) {
-        _y[i] = _x[_pv[i]];
-      }
+      pforeach(0, y.numel(), [&](int64_t i) { _y[i] = _x[_pv[i]]; });
     });
   });
   return y;
@@ -112,9 +105,7 @@ NdArrayRef genInversePerm(const NdArrayRef& perm) {
   DISPATCH_ALL_FIELDS(field, [&]() {
     NdArrayView<ring2k_t> _ret(ret);
     NdArrayView<ring2k_t> _perm(perm);
-    for (int64_t i = 0; i < perm.numel(); ++i) {
-      _ret[_perm[i]] = ring2k_t(i);
-    }
+    pforeach(0, perm.numel(), [&](int64_t i) { _ret[_perm[i]] = ring2k_t(i); });
   });
   return ret;
 }
diff --git a/libspu/spu.proto b/libspu/spu.proto
index a9050c5df..5a4adbe2e 100644
--- a/libspu/spu.proto
+++ b/libspu/spu.proto
@@ -80,13 +80,13 @@ enum PtType {
   PT_I128 = 9;     // int128_t
   PT_U128 = 10;    // uint128_t
   PT_I1 = 11;      // bool
-                   //
-  PT_F16 = 30;     // half
-  PT_F32 = 31;     // float
-  PT_F64 = 32;     // double
-                   //
-  PT_CF32 = 50;    // complex float
-  PT_CF64 = 51;    // complex double
+
+  PT_F16 = 30;  // half
+  PT_F32 = 31;  // float
+  PT_F64 = 32;  // double
+
+  PT_CF32 = 50;  // complex float
+  PT_CF64 = 51;  // complex double
 }
 
 // A security parameter type.
@@ -228,6 +228,23 @@ message RuntimeConfig {
   // default: 128 * 1024 * 1024
   uint64 share_max_chunk_size = 20;
 
+  enum SortMethod {
+    SORT_DEFAULT = 0;  // Implementation defined.
+    SORT_RADIX = 1;    // The radix sort (stable sort, need efficient shuffle).
+    SORT_QUICK = 2;    // The quick sort (unstable, need efficient shuffle).
+    SORT_NETWORK = 3;  // The odd-even sorting network (unstable, most general).
+  }
+
+  // SPU supports multiple sorting algorithms.
+  //  - for 2pc, only the sorting network is supported.
+  //  - for 2.5pc or 3pc, all these algorithms are supported.
+  //  - for stable sort, only radix sort is supported.
+  SortMethod sort_method = 21;
+
+  // threshold for quick sort, when the size of the array is less than this
+  // value, use merge sort instead
+  int64 quick_sort_threshold = 22;
+
   // @exclude
   // Fixed-point arithmetic related, reserved for [50, 100)
 
@@ -353,6 +370,20 @@ message RuntimeConfig {
   bool experimental_exp_prime_enable_upper_bound = 109;
 }
 
+message ClientSSLConfig {
+  // Certificate in PEM format; both a file path and a raw string are supported
+  string certificate = 1;
+  // Private key in PEM format; both a file path and a raw string are
+  // supported, told apart by prefix
+  string private_key = 2;
+  // The trusted CA file to verify the peer's certificate
+  // If empty, use the system default CA files
+  string ca_file_path = 3;
+  // Maximum depth of the certificate chain for verification
+  // If 0, turn off the verification
+  int32 verify_depth = 4;
+}
+
 message TTPBeaverConfig {
   // TrustedThirdParty beaver server's remote ip:port or load-balance uri.
   string server_host = 1;
@@ -363,10 +394,14 @@ message TTPBeaverConfig {
   // asym_crypto_schema: support ["SM2"]
   // Will support 25519 in the future, after yacl supported it.
   string asym_crypto_schema = 3;
-  // server's public key
-  bytes server_public_key = 4;
+  // Server's public key in PEM format
+  string server_public_key = 4;
+
+  // Transport protocol, support ["http", "h2"]
+  string transport_protocol = 5;
 
-  // TODO: TLS & brpc options.
+  // Configurations related to SSL
+  ClientSSLConfig ssl_config = 6;
 }
 
 enum CheetahOtKind {
diff --git a/libspu/version.h b/libspu/version.h.in
similarity index 94%
rename from libspu/version.h
rename to libspu/version.h.in
index 8ab54eea4..25251c30f 100644
--- a/libspu/version.h
+++ b/libspu/version.h.in
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#define SPU_VERSION "0.9.4.dev$$DATE$$"
+#define SPU_VERSION "@SPU_VERSION@"
 
 #include <string_view>
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 33bcd4da8..6ba68f151 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,12 @@
+# basic deps
+grpcio==1.66.0
+numpy>=1.22.0, <2 # FIXME: for SF compatibility
+protobuf==5.27.3
+cloudpickle>=2.0.0
+multiprocess>=0.70.12.2
+cachetools>=5.0.0
+jax[cpu]>=0.4.16, <=0.4.34 # FIXME: Jax 0.4.26+ select perf issue
+termcolor>=2.0.0
 pandas>=1.4.2
 flax<0.10.0
 scikit-learn<1.6.0
@@ -6,3 +15,17 @@ absl-py>=1.1.0
 tensorflow-cpu>=2.12.0; sys_platform == "linux" and platform_machine == 'x86_64'
 tensorflow>=2.12.0; sys_platform != "linux" or platform_machine != 'x86_64'
 h5py!=3.11.0; platform_machine == 'aarch64'
+
+# for examples
+dm-haiku
+plotnine
+jraph
+optax
+torch==2.3.0
+torch_xla==2.3.0; sys_platform == "linux" and platform_machine == 'x86_64'
+torchvision
+tensorflow_datasets
+keras
+setuptools<71.0.0 # https://github.com/pypa/setuptools/issues/4487
+transformers
+datasets
diff --git a/requirements.txt b/requirements.txt
index 2c52e8279..dc76ff0fa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-grpcio>=1.42.0,!=1.48.0
+grpcio==1.66.0
 numpy>=1.22.0, <2 # FIXME: for SF compatibility
-protobuf>=4, <5
+protobuf==5.27.3
 cloudpickle>=2.0.0
 multiprocess>=0.70.12.2
 cachetools>=5.0.0
diff --git a/requirements_dev_lock.txt b/requirements_dev_lock.txt
new file mode 100644
index 000000000..d0f118830
--- /dev/null
+++ b/requirements_dev_lock.txt
@@ -0,0 +1,2732 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    bazel run //:requirements-dev.update
+#
+absl-py==2.1.0 \
+    --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \
+    --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff
+    # via
+    #   -r requirements-dev.txt
+    #   array-record
+    #   chex
+    #   dm-haiku
+    #   etils
+    #   keras
+    #   optax
+    #   orbax-checkpoint
+    #   tensorboard
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+    #   tensorflow-metadata
+    #   torch-xla
+aiohappyeyeballs==2.4.4 \
+    --hash=sha256:5fdd7d87889c63183afc18ce9271f9b0a7d32c2303e394468dd45d514a757745 \
+    --hash=sha256:a980909d50efcd44795c4afeca523296716d50cd756ddca6af8c65b996e27de8
+    # via aiohttp
+aiohttp==3.11.11 \
+    --hash=sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f \
+    --hash=sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33 \
+    --hash=sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1 \
+    --hash=sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665 \
+    --hash=sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9 \
+    --hash=sha256:10b4ff0ad793d98605958089fabfa350e8e62bd5d40aa65cdc69d6785859f94e \
+    --hash=sha256:1642eceeaa5ab6c9b6dfeaaa626ae314d808188ab23ae196a34c9d97efb68350 \
+    --hash=sha256:1dac54e8ce2ed83b1f6b1a54005c87dfed139cf3f777fdc8afc76e7841101226 \
+    --hash=sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d \
+    --hash=sha256:1f21bb8d0235fc10c09ce1d11ffbd40fc50d3f08a89e4cf3a0c503dc2562247a \
+    --hash=sha256:2170816e34e10f2fd120f603e951630f8a112e1be3b60963a1f159f5699059a6 \
+    --hash=sha256:21fef42317cf02e05d3b09c028712e1d73a9606f02467fd803f7c1f39cc59add \
+    --hash=sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e \
+    --hash=sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8 \
+    --hash=sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03 \
+    --hash=sha256:3e23419d832d969f659c208557de4a123e30a10d26e1e14b73431d3c13444c2e \
+    --hash=sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2 \
+    --hash=sha256:44167fc6a763d534a6908bdb2592269b4bf30a03239bcb1654781adf5e49caf1 \
+    --hash=sha256:479b8c6ebd12aedfe64563b85920525d05d394b85f166b7873c8bde6da612f9c \
+    --hash=sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538 \
+    --hash=sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5 \
+    --hash=sha256:4eed954b161e6b9b65f6be446ed448ed3921763cc432053ceb606f89d793927e \
+    --hash=sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9 \
+    --hash=sha256:568c1236b2fde93b7720f95a890741854c1200fba4a3471ff48b2934d2d93fd3 \
+    --hash=sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438 \
+    --hash=sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12 \
+    --hash=sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3 \
+    --hash=sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853 \
+    --hash=sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287 \
+    --hash=sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2 \
+    --hash=sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9 \
+    --hash=sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c \
+    --hash=sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55 \
+    --hash=sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c \
+    --hash=sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e \
+    --hash=sha256:8be8508d110d93061197fd2d6a74f7401f73b6d12f8822bbcd6d74f2b55d71b1 \
+    --hash=sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c \
+    --hash=sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194 \
+    --hash=sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773 \
+    --hash=sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e \
+    --hash=sha256:943a8b052e54dfd6439fd7989f67fc6a7f2138d0a2cf0a7de5f18aa4fe7eb3b1 \
+    --hash=sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d \
+    --hash=sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600 \
+    --hash=sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34 \
+    --hash=sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3 \
+    --hash=sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8 \
+    --hash=sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8 \
+    --hash=sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2 \
+    --hash=sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff \
+    --hash=sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62 \
+    --hash=sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac \
+    --hash=sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef \
+    --hash=sha256:b540bd67cfb54e6f0865ceccd9979687210d7ed1a1cc8c01f8e67e2f1e883d28 \
+    --hash=sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab \
+    --hash=sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104 \
+    --hash=sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76 \
+    --hash=sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e \
+    --hash=sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d \
+    --hash=sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a \
+    --hash=sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5 \
+    --hash=sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745 \
+    --hash=sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4 \
+    --hash=sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99 \
+    --hash=sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43 \
+    --hash=sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da \
+    --hash=sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231 \
+    --hash=sha256:d6c9af134da4bc9b3bd3e6a70072509f295d10ee60c697826225b60b9959acdd \
+    --hash=sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d \
+    --hash=sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87 \
+    --hash=sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886 \
+    --hash=sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2 \
+    --hash=sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b \
+    --hash=sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d \
+    --hash=sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f \
+    --hash=sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204 \
+    --hash=sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e
+    # via
+    #   datasets
+    #   fsspec
+aiosignal==1.3.2 \
+    --hash=sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5 \
+    --hash=sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54
+    # via aiohttp
+array-record==0.6.0 \
+    --hash=sha256:035575c271461f26a0684db5e3b65a487233d0921880933f680e7aeb86130a39 \
+    --hash=sha256:1ea2596fb8bf19eade5e8c2d0dce9c4dc6a9d14222551863d32238f7e5754afe \
+    --hash=sha256:370cf9bdcdaab7537e897aae017ea607f75ac33378991d2fbb1e52b1fedb2bcf \
+    --hash=sha256:4c85df128819191a4f85937ab390f59f181ab7b6183626e5d0f5ecab47ecb022 \
+    --hash=sha256:5338900974e2f10b3021b874a4f226783ffdbb0be76c931363a557336d33e478 \
+    --hash=sha256:af81f6ae5404a42962b96f4efacd9a9b098cb2eeddae068cde9be0b8bfbfc457 \
+    --hash=sha256:b28be32f7c81db3ec17d343899a6b5b8ae19f6d6e650448b8044de65774fa3e5 \
+    --hash=sha256:c418b2b83410c630e6662d4ce0156e4e5120ee27ea9ed7672dd87c9cda39a060 \
+    --hash=sha256:c51b53b90c7d4035ae94e8b265196925e6c5f5673aa35e04874aecca78656de3
+    # via tensorflow-datasets
+astunparse==1.6.3 \
+    --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \
+    --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8
+    # via tensorflow-cpu
+attrs==24.3.0 \
+    --hash=sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff \
+    --hash=sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308
+    # via aiohttp
+cachetools==5.5.0 \
+    --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \
+    --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a
+    # via
+    #   -r requirements-dev.txt
+    #   google-auth
+certifi==2024.12.14 \
+    --hash=sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56 \
+    --hash=sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db
+    # via requests
+charset-normalizer==3.4.1 \
+    --hash=sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537 \
+    --hash=sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa \
+    --hash=sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a \
+    --hash=sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294 \
+    --hash=sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b \
+    --hash=sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd \
+    --hash=sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601 \
+    --hash=sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd \
+    --hash=sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4 \
+    --hash=sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d \
+    --hash=sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2 \
+    --hash=sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313 \
+    --hash=sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd \
+    --hash=sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa \
+    --hash=sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8 \
+    --hash=sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1 \
+    --hash=sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2 \
+    --hash=sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496 \
+    --hash=sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d \
+    --hash=sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b \
+    --hash=sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e \
+    --hash=sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a \
+    --hash=sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4 \
+    --hash=sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca \
+    --hash=sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78 \
+    --hash=sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408 \
+    --hash=sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5 \
+    --hash=sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3 \
+    --hash=sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f \
+    --hash=sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a \
+    --hash=sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765 \
+    --hash=sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6 \
+    --hash=sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146 \
+    --hash=sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6 \
+    --hash=sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9 \
+    --hash=sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd \
+    --hash=sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c \
+    --hash=sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f \
+    --hash=sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545 \
+    --hash=sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176 \
+    --hash=sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770 \
+    --hash=sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824 \
+    --hash=sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f \
+    --hash=sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf \
+    --hash=sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487 \
+    --hash=sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d \
+    --hash=sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd \
+    --hash=sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b \
+    --hash=sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534 \
+    --hash=sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f \
+    --hash=sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b \
+    --hash=sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9 \
+    --hash=sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd \
+    --hash=sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125 \
+    --hash=sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9 \
+    --hash=sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de \
+    --hash=sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11 \
+    --hash=sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d \
+    --hash=sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35 \
+    --hash=sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f \
+    --hash=sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda \
+    --hash=sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7 \
+    --hash=sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a \
+    --hash=sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971 \
+    --hash=sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8 \
+    --hash=sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41 \
+    --hash=sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d \
+    --hash=sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f \
+    --hash=sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757 \
+    --hash=sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a \
+    --hash=sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886 \
+    --hash=sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77 \
+    --hash=sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76 \
+    --hash=sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247 \
+    --hash=sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85 \
+    --hash=sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb \
+    --hash=sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7 \
+    --hash=sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e \
+    --hash=sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6 \
+    --hash=sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037 \
+    --hash=sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1 \
+    --hash=sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e \
+    --hash=sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807 \
+    --hash=sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407 \
+    --hash=sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c \
+    --hash=sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12 \
+    --hash=sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3 \
+    --hash=sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089 \
+    --hash=sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd \
+    --hash=sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e \
+    --hash=sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00 \
+    --hash=sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616
+    # via requests
+chex==0.1.88 \
+    --hash=sha256:234b61a5baa8132802e4b9c5657167d6c8a911d90a59a0bec47d537567e41b75 \
+    --hash=sha256:565de897b1373232cdfca5e699f50fa49403d2c7d23f6c5a75a97ef713d2fe36
+    # via optax
+click==8.1.8 \
+    --hash=sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2 \
+    --hash=sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a
+    # via tensorflow-datasets
+cloud-tpu-client==0.10 \
+    --hash=sha256:e3ee7a0a69c3fdbfc82826d86762f24e43bfcd6096af047185708fe062c7f849
+    # via torch-xla
+cloudpickle==3.1.0 \
+    --hash=sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b \
+    --hash=sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e
+    # via -r requirements-dev.txt
+contourpy==1.3.1 \
+    --hash=sha256:041b640d4ec01922083645a94bb3b2e777e6b626788f4095cf21abbe266413c1 \
+    --hash=sha256:05e806338bfeaa006acbdeba0ad681a10be63b26e1b17317bfac3c5d98f36cda \
+    --hash=sha256:08d9d449a61cf53033612cb368f3a1b26cd7835d9b8cd326647efe43bca7568d \
+    --hash=sha256:0ffa84be8e0bd33410b17189f7164c3589c229ce5db85798076a3fa136d0e509 \
+    --hash=sha256:113231fe3825ebf6f15eaa8bc1f5b0ddc19d42b733345eae0934cb291beb88b6 \
+    --hash=sha256:14c102b0eab282427b662cb590f2e9340a9d91a1c297f48729431f2dcd16e14f \
+    --hash=sha256:174e758c66bbc1c8576992cec9599ce8b6672b741b5d336b5c74e35ac382b18e \
+    --hash=sha256:19c1555a6801c2f084c7ddc1c6e11f02eb6a6016ca1318dd5452ba3f613a1751 \
+    --hash=sha256:19d40d37c1c3a4961b4619dd9d77b12124a453cc3d02bb31a07d58ef684d3d86 \
+    --hash=sha256:1bf98051f1045b15c87868dbaea84f92408337d4f81d0e449ee41920ea121d3b \
+    --hash=sha256:20914c8c973f41456337652a6eeca26d2148aa96dd7ac323b74516988bea89fc \
+    --hash=sha256:287ccc248c9e0d0566934e7d606201abd74761b5703d804ff3df8935f523d546 \
+    --hash=sha256:2ba94a401342fc0f8b948e57d977557fbf4d515f03c67682dd5c6191cb2d16ec \
+    --hash=sha256:31c1b55c1f34f80557d3830d3dd93ba722ce7e33a0b472cba0ec3b6535684d8f \
+    --hash=sha256:36987a15e8ace5f58d4d5da9dca82d498c2bbb28dff6e5d04fbfcc35a9cb3a82 \
+    --hash=sha256:3a04ecd68acbd77fa2d39723ceca4c3197cb2969633836ced1bea14e219d077c \
+    --hash=sha256:3e8b974d8db2c5610fb4e76307e265de0edb655ae8169e8b21f41807ccbeec4b \
+    --hash=sha256:3ea9924d28fc5586bf0b42d15f590b10c224117e74409dd7a0be3b62b74a501c \
+    --hash=sha256:4318af1c925fb9a4fb190559ef3eec206845f63e80fb603d47f2d6d67683901c \
+    --hash=sha256:44a29502ca9c7b5ba389e620d44f2fbe792b1fb5734e8b931ad307071ec58c53 \
+    --hash=sha256:47734d7073fb4590b4a40122b35917cd77be5722d80683b249dac1de266aac80 \
+    --hash=sha256:4d76d5993a34ef3df5181ba3c92fabb93f1eaa5729504fb03423fcd9f3177242 \
+    --hash=sha256:4dbbc03a40f916a8420e420d63e96a1258d3d1b58cbdfd8d1f07b49fcbd38e85 \
+    --hash=sha256:500360b77259914f7805af7462e41f9cb7ca92ad38e9f94d6c8641b089338124 \
+    --hash=sha256:523a8ee12edfa36f6d2a49407f705a6ef4c5098de4f498619787e272de93f2d5 \
+    --hash=sha256:573abb30e0e05bf31ed067d2f82500ecfdaec15627a59d63ea2d95714790f5c2 \
+    --hash=sha256:5b75aa69cb4d6f137b36f7eb2ace9280cfb60c55dc5f61c731fdf6f037f958a3 \
+    --hash=sha256:61332c87493b00091423e747ea78200659dc09bdf7fd69edd5e98cef5d3e9a8d \
+    --hash=sha256:805617228ba7e2cbbfb6c503858e626ab528ac2a32a04a2fe88ffaf6b02c32bc \
+    --hash=sha256:841ad858cff65c2c04bf93875e384ccb82b654574a6d7f30453a04f04af71342 \
+    --hash=sha256:89785bb2a1980c1bd87f0cb1517a71cde374776a5f150936b82580ae6ead44a1 \
+    --hash=sha256:8eb96e79b9f3dcadbad2a3891672f81cdcab7f95b27f28f1c67d75f045b6b4f1 \
+    --hash=sha256:974d8145f8ca354498005b5b981165b74a195abfae9a8129df3e56771961d595 \
+    --hash=sha256:9ddeb796389dadcd884c7eb07bd14ef12408aaae358f0e2ae24114d797eede30 \
+    --hash=sha256:a045f341a77b77e1c5de31e74e966537bba9f3c4099b35bf4c2e3939dd54cdab \
+    --hash=sha256:a0cffcbede75c059f535725c1680dfb17b6ba8753f0c74b14e6a9c68c29d7ea3 \
+    --hash=sha256:a761d9ccfc5e2ecd1bf05534eda382aa14c3e4f9205ba5b1684ecfe400716ef2 \
+    --hash=sha256:a7895f46d47671fa7ceec40f31fae721da51ad34bdca0bee83e38870b1f47ffd \
+    --hash=sha256:a9fa36448e6a3a1a9a2ba23c02012c43ed88905ec80163f2ffe2421c7192a5d7 \
+    --hash=sha256:ab29962927945d89d9b293eabd0d59aea28d887d4f3be6c22deaefbb938a7277 \
+    --hash=sha256:abbb49fb7dac584e5abc6636b7b2a7227111c4f771005853e7d25176daaf8453 \
+    --hash=sha256:ac4578ac281983f63b400f7fe6c101bedc10651650eef012be1ccffcbacf3697 \
+    --hash=sha256:adce39d67c0edf383647a3a007de0a45fd1b08dedaa5318404f1a73059c2512b \
+    --hash=sha256:ade08d343436a94e633db932e7e8407fe7de8083967962b46bdfc1b0ced39454 \
+    --hash=sha256:b2bdca22a27e35f16794cf585832e542123296b4687f9fd96822db6bae17bfc9 \
+    --hash=sha256:b2f926efda994cdf3c8d3fdb40b9962f86edbc4457e739277b961eced3d0b4c1 \
+    --hash=sha256:b457d6430833cee8e4b8e9b6f07aa1c161e5e0d52e118dc102c8f9bd7dd060d6 \
+    --hash=sha256:c414fc1ed8ee1dbd5da626cf3710c6013d3d27456651d156711fa24f24bd1291 \
+    --hash=sha256:cb76c1a154b83991a3cbbf0dfeb26ec2833ad56f95540b442c73950af2013750 \
+    --hash=sha256:dfd97abd83335045a913e3bcc4a09c0ceadbe66580cf573fe961f4a825efa699 \
+    --hash=sha256:e914a8cb05ce5c809dd0fe350cfbb4e881bde5e2a38dc04e3afe1b3e58bd158e \
+    --hash=sha256:ece6df05e2c41bd46776fbc712e0996f7c94e0d0543af1656956d150c4ca7c81 \
+    --hash=sha256:efa874e87e4a647fd2e4f514d5e91c7d493697127beb95e77d2f7561f6905bd9 \
+    --hash=sha256:f611e628ef06670df83fce17805c344710ca5cde01edfdc72751311da8585375
+    # via matplotlib
+cycler==0.12.1 \
+    --hash=sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30 \
+    --hash=sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c
+    # via matplotlib
+datasets==2.2.1 \
+    --hash=sha256:1938f3e99599422de50b9b54fe802aca854ed130382dab0b3820c821f7ae6d5e \
+    --hash=sha256:d362717c4394589b516c8f397ff20a6fe720454aed877ab61d06f3bc05df9544
+    # via -r requirements-dev.txt
+dill==0.3.9 \
+    --hash=sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a \
+    --hash=sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c
+    # via
+    #   datasets
+    #   multiprocess
+dm-haiku==0.0.13 \
+    --hash=sha256:029bb91b5b1edb0d3fe23304d3bf12a545ea6e485041f7f5d8c8d85ebcf6e17d \
+    --hash=sha256:ee9562c68a059f146ad07f555ca591cb8c11ef751afecc38353863562bd23f43
+    # via -r requirements-dev.txt
+dm-tree==0.1.8 \
+    --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \
+    --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \
+    --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \
+    --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \
+    --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \
+    --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \
+    --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \
+    --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \
+    --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \
+    --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \
+    --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \
+    --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \
+    --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \
+    --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \
+    --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \
+    --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \
+    --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \
+    --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \
+    --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \
+    --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \
+    --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \
+    --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \
+    --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \
+    --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \
+    --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \
+    --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \
+    --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \
+    --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \
+    --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \
+    --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \
+    --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \
+    --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \
+    --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \
+    --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \
+    --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \
+    --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \
+    --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \
+    --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \
+    --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \
+    --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \
+    --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \
+    --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \
+    --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \
+    --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \
+    --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \
+    --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d
+    # via tensorflow-datasets
+docstring-parser==0.16 \
+    --hash=sha256:538beabd0af1e2db0146b6bd3caa526c35a34d61af9fd2887f3a8a27a739aa6e \
+    --hash=sha256:bf0a1387354d3691d102edef7ec124f219ef639982d096e26e3b60aeffa90637
+    # via simple-parsing
+etils[array-types,edc,enp,epath,epy,etqdm,etree]==1.11.0 \
+    --hash=sha256:a394cf3476bcec51c221426a70c39cd1006e889456ba41e4d7f12fd6814be7a5 \
+    --hash=sha256:aff3278a3be7fddf302dfd80335e9f924244666c71239cd91e836f3d055f1c4a
+    # via
+    #   array-record
+    #   optax
+    #   orbax-checkpoint
+    #   tensorflow-datasets
+filelock==3.16.1 \
+    --hash=sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 \
+    --hash=sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+flatbuffers==24.12.23 \
+    --hash=sha256:2910b0bc6ae9b6db78dd2b18d0b7a0709ba240fb5585f286a3a2b30785c22dac \
+    --hash=sha256:c418e0d48890f4142b92fd3e343e73a48f194e1f80075ddcc5793779b3585444
+    # via tensorflow-cpu
+flax==0.9.0 \
+    --hash=sha256:12cd8f7162165ddd56877fb1cd9a4fcb47a31569e4c5343eeb59a36369fa2cfe \
+    --hash=sha256:8b7f361eed0f5324e81f9dc8d02ea53da5f993d7c2e37e7aa5b37d3f6331dd53
+    # via -r requirements-dev.txt
+fonttools==4.55.3 \
+    --hash=sha256:07f8288aacf0a38d174445fc78377a97fb0b83cfe352a90c9d9c1400571963c7 \
+    --hash=sha256:11e5de1ee0d95af4ae23c1a138b184b7f06e0b6abacabf1d0db41c90b03d834b \
+    --hash=sha256:1bc7ad24ff98846282eef1cbeac05d013c2154f977a79886bb943015d2b1b261 \
+    --hash=sha256:1dcc07934a2165ccdc3a5a608db56fb3c24b609658a5b340aee4ecf3ba679dc0 \
+    --hash=sha256:22f38464daa6cdb7b6aebd14ab06609328fe1e9705bb0fcc7d1e69de7109ee02 \
+    --hash=sha256:27e4ae3592e62eba83cd2c4ccd9462dcfa603ff78e09110680a5444c6925d841 \
+    --hash=sha256:3983313c2a04d6cc1fe9251f8fc647754cf49a61dac6cb1e7249ae67afaafc45 \
+    --hash=sha256:529cef2ce91dc44f8e407cc567fae6e49a1786f2fefefa73a294704c415322a4 \
+    --hash=sha256:5323a22eabddf4b24f66d26894f1229261021dacd9d29e89f7872dd8c63f0b8b \
+    --hash=sha256:54153c49913f45065c8d9e6d0c101396725c5621c8aee744719300f79771d75a \
+    --hash=sha256:546565028e244a701f73df6d8dd6be489d01617863ec0c6a42fa25bf45d43048 \
+    --hash=sha256:5480673f599ad410695ca2ddef2dfefe9df779a9a5cda89503881e503c9c7d90 \
+    --hash=sha256:5e8d657cd7326eeaba27de2740e847c6b39dde2f8d7cd7cc56f6aad404ddf0bd \
+    --hash=sha256:62d65a3022c35e404d19ca14f291c89cc5890032ff04f6c17af0bd1927299674 \
+    --hash=sha256:6314bf82c54c53c71805318fcf6786d986461622dd926d92a465199ff54b1b72 \
+    --hash=sha256:7a8aa2c5e5b8b3bcb2e4538d929f6589a5c6bdb84fd16e2ed92649fb5454f11c \
+    --hash=sha256:827e95fdbbd3e51f8b459af5ea10ecb4e30af50221ca103bea68218e9615de07 \
+    --hash=sha256:859c358ebf41db18fb72342d3080bce67c02b39e86b9fbcf1610cca14984841b \
+    --hash=sha256:86721fbc389ef5cc1e2f477019e5069e8e4421e8d9576e9c26f840dbb04678de \
+    --hash=sha256:89bdc5d88bdeec1b15af790810e267e8332d92561dce4f0748c2b95c9bdf3926 \
+    --hash=sha256:8c4491699bad88efe95772543cd49870cf756b019ad56294f6498982408ab03e \
+    --hash=sha256:8c5ec45428edaa7022f1c949a632a6f298edc7b481312fc7dc258921e9399628 \
+    --hash=sha256:8e75f12c82127486fac2d8bfbf5bf058202f54bf4f158d367e41647b972342ca \
+    --hash=sha256:a430178ad3e650e695167cb53242dae3477b35c95bef6525b074d87493c4bf29 \
+    --hash=sha256:a8c2794ded89399cc2169c4d0bf7941247b8d5932b2659e09834adfbb01589aa \
+    --hash=sha256:aca318b77f23523309eec4475d1fbbb00a6b133eb766a8bdc401faba91261abe \
+    --hash=sha256:ae3b6600565b2d80b7c05acb8e24d2b26ac407b27a3f2e078229721ba5698427 \
+    --hash=sha256:aedbeb1db64496d098e6be92b2e63b5fac4e53b1b92032dfc6988e1ea9134a4d \
+    --hash=sha256:aee3b57643827e237ff6ec6d28d9ff9766bd8b21e08cd13bff479e13d4b14765 \
+    --hash=sha256:b54baf65c52952db65df39fcd4820668d0ef4766c0ccdf32879b77f7c804d5c5 \
+    --hash=sha256:b586ab5b15b6097f2fb71cafa3c98edfd0dba1ad8027229e7b1e204a58b0e09d \
+    --hash=sha256:b8d5e8916c0970fbc0f6f1bece0063363bb5857a7f170121a4493e31c3db3314 \
+    --hash=sha256:bc5dbb4685e51235ef487e4bd501ddfc49be5aede5e40f4cefcccabc6e60fb4b \
+    --hash=sha256:bdcc9f04b36c6c20978d3f060e5323a43f6222accc4e7fcbef3f428e216d96af \
+    --hash=sha256:c3ca99e0d460eff46e033cd3992a969658c3169ffcd533e0a39c63a38beb6831 \
+    --hash=sha256:caf8230f3e10f8f5d7593eb6d252a37caf58c480b19a17e250a63dad63834cf3 \
+    --hash=sha256:cd70de1a52a8ee2d1877b6293af8a2484ac82514f10b1c67c1c5762d38073e56 \
+    --hash=sha256:cf4fe7c124aa3f4e4c1940880156e13f2f4d98170d35c749e6b4f119a872551e \
+    --hash=sha256:d342e88764fb201286d185093781bf6628bbe380a913c24adf772d901baa8276 \
+    --hash=sha256:da9da6d65cd7aa6b0f806556f4985bcbf603bf0c5c590e61b43aa3e5a0f822d0 \
+    --hash=sha256:dc5294a3d5c84226e3dbba1b6f61d7ad813a8c0238fceea4e09aa04848c3d851 \
+    --hash=sha256:dd68c87a2bfe37c5b33bcda0fba39b65a353876d3b9006fde3adae31f97b3ef5 \
+    --hash=sha256:e6e8766eeeb2de759e862004aa11a9ea3d6f6d5ec710551a88b476192b64fd54 \
+    --hash=sha256:e894b5bd60d9f473bed7a8f506515549cc194de08064d829464088d23097331b \
+    --hash=sha256:eb6ca911c4c17eb51853143624d8dc87cdcdf12a711fc38bf5bd21521e79715f \
+    --hash=sha256:ed63959d00b61959b035c7d47f9313c2c1ece090ff63afea702fe86de00dbed4 \
+    --hash=sha256:f412604ccbeee81b091b420272841e5ec5ef68967a9790e80bffd0e30b8e2977 \
+    --hash=sha256:f7d66c15ba875432a2d2fb419523f5d3d347f91f48f57b8b08a2dfc3c39b8a3f \
+    --hash=sha256:f9e736f60f4911061235603a6119e72053073a12c6d7904011df2d8fad2c0e35 \
+    --hash=sha256:fb594b5a99943042c702c550d5494bdd7577f6ef19b0bc73877c948a63184a32
+    # via matplotlib
+frozenlist==1.5.0 \
+    --hash=sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e \
+    --hash=sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf \
+    --hash=sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6 \
+    --hash=sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a \
+    --hash=sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d \
+    --hash=sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f \
+    --hash=sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28 \
+    --hash=sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b \
+    --hash=sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9 \
+    --hash=sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2 \
+    --hash=sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec \
+    --hash=sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2 \
+    --hash=sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c \
+    --hash=sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336 \
+    --hash=sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4 \
+    --hash=sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d \
+    --hash=sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b \
+    --hash=sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c \
+    --hash=sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10 \
+    --hash=sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08 \
+    --hash=sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942 \
+    --hash=sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8 \
+    --hash=sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f \
+    --hash=sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10 \
+    --hash=sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5 \
+    --hash=sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6 \
+    --hash=sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21 \
+    --hash=sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c \
+    --hash=sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d \
+    --hash=sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923 \
+    --hash=sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608 \
+    --hash=sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de \
+    --hash=sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17 \
+    --hash=sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0 \
+    --hash=sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f \
+    --hash=sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641 \
+    --hash=sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c \
+    --hash=sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a \
+    --hash=sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0 \
+    --hash=sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9 \
+    --hash=sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab \
+    --hash=sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f \
+    --hash=sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3 \
+    --hash=sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a \
+    --hash=sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784 \
+    --hash=sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604 \
+    --hash=sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d \
+    --hash=sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5 \
+    --hash=sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03 \
+    --hash=sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e \
+    --hash=sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953 \
+    --hash=sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee \
+    --hash=sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d \
+    --hash=sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817 \
+    --hash=sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3 \
+    --hash=sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039 \
+    --hash=sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f \
+    --hash=sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9 \
+    --hash=sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf \
+    --hash=sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76 \
+    --hash=sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba \
+    --hash=sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171 \
+    --hash=sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb \
+    --hash=sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439 \
+    --hash=sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631 \
+    --hash=sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972 \
+    --hash=sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d \
+    --hash=sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869 \
+    --hash=sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9 \
+    --hash=sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411 \
+    --hash=sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723 \
+    --hash=sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2 \
+    --hash=sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b \
+    --hash=sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99 \
+    --hash=sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e \
+    --hash=sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840 \
+    --hash=sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3 \
+    --hash=sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb \
+    --hash=sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3 \
+    --hash=sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0 \
+    --hash=sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca \
+    --hash=sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45 \
+    --hash=sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e \
+    --hash=sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f \
+    --hash=sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5 \
+    --hash=sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307 \
+    --hash=sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e \
+    --hash=sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2 \
+    --hash=sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778 \
+    --hash=sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a \
+    --hash=sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30 \
+    --hash=sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec[http]==2024.12.0 \
+    --hash=sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f \
+    --hash=sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2
+    # via
+    #   datasets
+    #   etils
+    #   huggingface-hub
+    #   torch
+gast==0.6.0 \
+    --hash=sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54 \
+    --hash=sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb
+    # via tensorflow-cpu
+google-api-core==1.16.0 \
+    --hash=sha256:859f7392676761f2b160c6ee030c3422135ada4458f0948c5690a6a7c8d86294 \
+    --hash=sha256:92e962a087f1c4b8d1c5c88ade1c1dfd550047dcffb320c57ef6a534a20403e2
+    # via google-api-python-client
+google-api-python-client==1.8.0 \
+    --hash=sha256:0f5b42a14e2d2f7dee40f2e4514531dbe95ebde9c2173b1c4040a65c427e7900 \
+    --hash=sha256:5032ad1af5046889649b3848f2e871889fbb6ae440198a549fe1699581300386
+    # via cloud-tpu-client
+google-auth==1.6.3 \
+    --hash=sha256:0f7c6a64927d34c1a474da92cfc59e552a5d3b940d3266606c6a28b72888b9e4 \
+    --hash=sha256:20705f6803fd2c4d1cc2dcb0df09d4dfcb9a7d51fd59e94a3a28231fd93119ed
+    # via
+    #   google-api-core
+    #   google-api-python-client
+    #   google-auth-httplib2
+google-auth-httplib2==0.2.0 \
+    --hash=sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05 \
+    --hash=sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d
+    # via google-api-python-client
+google-pasta==0.2.0 \
+    --hash=sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954 \
+    --hash=sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed \
+    --hash=sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e
+    # via tensorflow-cpu
+googleapis-common-protos==1.66.0 \
+    --hash=sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c \
+    --hash=sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed
+    # via
+    #   google-api-core
+    #   tensorflow-metadata
+grpcio==1.66.0 \
+    --hash=sha256:0f3010bf46b2a01c9e40644cb9ed91b4b8435e5c500a275da5f9f62580e31e80 \
+    --hash=sha256:1c5466222470cb7fbc9cc898af1d48eefd297cb2e2f59af6d4a851c862fa90ac \
+    --hash=sha256:1eb03524d0f55b965d6c86aa44e5db9e5eaa15f9ed3b164621e652e5b927f4b8 \
+    --hash=sha256:230cdd696751e7eb1395718cd308234749daa217bb8d128f00357dc4df102558 \
+    --hash=sha256:245b08f9b3c645a6a623f3ed4fa43dcfcd6ad701eb9c32511c1bb7380e8c3d23 \
+    --hash=sha256:296a45ea835e12a1cc35ab0c57e455346c272af7b0d178e29c67742167262b4c \
+    --hash=sha256:37514b68a42e9cf24536345d3cf9e580ffd29117c158b4eeea34625200256067 \
+    --hash=sha256:375b58892301a5fc6ca7d7ff689c9dc9d00895f5d560604ace9f4f0573013c63 \
+    --hash=sha256:423ae18637cd99ddcf2e5a6851c61828c49e9b9d022d0442d979b4f230109787 \
+    --hash=sha256:49234580a073ce7ac490112f6c67c874cbcb27804c4525978cdb21ba7f3f193c \
+    --hash=sha256:508411df1f2b7cfa05d4d7dbf3d576fe4f949cd61c03f3a6f0378c84e3d7b963 \
+    --hash=sha256:50cea8ce2552865b87e3dffbb85eb21e6b98d928621600c0feda2f02449cd837 \
+    --hash=sha256:516fdbc8e156db71a004bc431a6303bca24cfde186babe96dde7bd01e8f0cc70 \
+    --hash=sha256:526d4f6ca19f31b25606d5c470ecba55c0b22707b524e4de8987919e8920437d \
+    --hash=sha256:53d4c6706b49e358a2a33345dbe9b6b3bb047cecd7e8c07ba383bd09349bfef8 \
+    --hash=sha256:5b15ef1b296c4e78f15f64fc65bf8081f8774480ffcac45642f69d9d753d9c6b \
+    --hash=sha256:5e8140b39f10d7be2263afa2838112de29374c5c740eb0afd99146cb5bdbd990 \
+    --hash=sha256:5ea27f4ce8c0daccfdd2c7961e6ba404b6599f47c948415c4cca5728739107a3 \
+    --hash=sha256:5f4b3357e59dfba9140a51597287297bc638710d6a163f99ee14efc19967a821 \
+    --hash=sha256:5f93fc84b72bbc7b84a42f3ca9dc055fa00d2303d9803be011ebf7a10a4eb833 \
+    --hash=sha256:643d8d9632a688ae69661e924b862e23c83a3575b24e52917ec5bcc59543d212 \
+    --hash=sha256:684a4c07883cbd4ac864f0d08d927267404f5f0c76f31c85f9bbe05f2daae2f2 \
+    --hash=sha256:6d586a95c05c82a5354be48bb4537e1accaf2472d8eb7e9086d844cbff934482 \
+    --hash=sha256:6ed35bf7da3fb3b1949e32bdf47a8b5ffe0aed11722d948933bd068531cd4682 \
+    --hash=sha256:748452dbd5a047475d5413bdef08b0b9ceb2c0c0e249d4ee905a5fb82c6328dc \
+    --hash=sha256:7bc9d823e05d63a87511fb456dcc48dc0fced86c282bf60229675e7ee7aac1a1 \
+    --hash=sha256:8096a922eb91bc97c839f675c3efa1257c6ef181ae1b25d3fb97f2cae4c57c01 \
+    --hash=sha256:832945e64176520520317b50d64ec7d79924429528d5747669b52d0bf2c7bd78 \
+    --hash=sha256:8fc5c710ddd51b5a0dc36ef1b6663430aa620e0ce029b87b150dafd313b978c3 \
+    --hash=sha256:921b8f7f25d5300d7c6837a1e0639ef145fbdbfb728e0a5db2dbccc9fc0fd891 \
+    --hash=sha256:9d5251578767fe44602688c851c2373b5513048ac84c21a0fe946590a8e7933d \
+    --hash=sha256:a639d3866bfb5a678b5c0b92cd7ab543033ed8988854290fd86145e71731fd4c \
+    --hash=sha256:aaf30c75cbaf30e561ca45f21eb1f729f0fab3f15c592c1074795ed43e3ff96f \
+    --hash=sha256:ad7256f224437b2c29c2bef98ddd3130454c5b1ab1f0471fc11794cefd4dbd3d \
+    --hash=sha256:ba18cfdc09312eb2eea6fa0ce5d2eec3cf345ea78f6528b2eaed6432105e0bd0 \
+    --hash=sha256:ba60ae3b465b3e85080ae3bfbc36fd0305ae495ab16fcf8022fc7d7a23aac846 \
+    --hash=sha256:bc008c6afa1e7c8df99bd9154abc4f0470d26b7730ca2521122e99e771baa8c7 \
+    --hash=sha256:c072f90a1f0409f827ae86266984cba65e89c5831a0726b9fc7f4b5fb940b853 \
+    --hash=sha256:c1ea4c528e7db6660718e4165fd1b5ac24b79a70c870a7bc0b7bdb9babab7c1e \
+    --hash=sha256:c3084e590e857ba7585ae91078e4c9b6ef55aaf1dc343ce26400ba59a146eada \
+    --hash=sha256:c3f6feb0dc8456d025e566709f7dd02885add99bedaac50229013069242a1bfd \
+    --hash=sha256:d0439a970d65327de21c299ea0e0c2ad0987cdaf18ba5066621dea5f427f922b \
+    --hash=sha256:dd614370e939f9fceeeb2915111a0795271b4c11dfb5fc0f58449bee40c726a5 \
+    --hash=sha256:de9e20a0acb709dcfa15a622c91f584f12c9739a79c47999f73435d2b3cc8a3b \
+    --hash=sha256:e36fa838ac1d6c87198ca149cbfcc92e1af06bb8c8cd852622f8e58f33ea3324 \
+    --hash=sha256:e8d20308eeae15b3e182f47876f05acbdec1eebd9473a9814a44e46ec4a84c04
+    # via
+    #   -r requirements-dev.txt
+    #   tensorboard
+    #   tensorflow-cpu
+h5py==3.12.1 \
+    --hash=sha256:018a4597f35092ae3fb28ee851fdc756d2b88c96336b8480e124ce1ac6fb9166 \
+    --hash=sha256:050a4f2c9126054515169c49cb900949814987f0c7ae74c341b0c9f9b5056834 \
+    --hash=sha256:06a903a4e4e9e3ebbc8b548959c3c2552ca2d70dac14fcfa650d9261c66939ed \
+    --hash=sha256:1473348139b885393125126258ae2d70753ef7e9cec8e7848434f385ae72069e \
+    --hash=sha256:2f0f1a382cbf494679c07b4371f90c70391dedb027d517ac94fa2c05299dacda \
+    --hash=sha256:326d70b53d31baa61f00b8aa5f95c2fcb9621a3ee8365d770c551a13dbbcbfdf \
+    --hash=sha256:3b15d8dbd912c97541312c0e07438864d27dbca857c5ad634de68110c6beb1c2 \
+    --hash=sha256:3fdf95092d60e8130ba6ae0ef7a9bd4ade8edbe3569c13ebbaf39baefffc5ba4 \
+    --hash=sha256:4532c7e97fbef3d029735db8b6f5bf01222d9ece41e309b20d63cfaae2fb5c4d \
+    --hash=sha256:513171e90ed92236fc2ca363ce7a2fc6f2827375efcbb0cc7fbdd7fe11fecafc \
+    --hash=sha256:52ab036c6c97055b85b2a242cb540ff9590bacfda0c03dd0cf0661b311f522f8 \
+    --hash=sha256:577d618d6b6dea3da07d13cc903ef9634cde5596b13e832476dd861aaf651f3e \
+    --hash=sha256:59400f88343b79655a242068a9c900001a34b63e3afb040bd7cdf717e440f653 \
+    --hash=sha256:59685fe40d8c1fbbee088c88cd4da415a2f8bee5c270337dc5a1c4aa634e3307 \
+    --hash=sha256:5c4b41d1019322a5afc5082864dfd6359f8935ecd37c11ac0029be78c5d112c9 \
+    --hash=sha256:62be1fc0ef195891949b2c627ec06bc8e837ff62d5b911b6e42e38e0f20a897d \
+    --hash=sha256:6fdf6d7936fa824acfa27305fe2d9f39968e539d831c5bae0e0d83ed521ad1ac \
+    --hash=sha256:7b3b8f3b48717e46c6a790e3128d39c61ab595ae0a7237f06dfad6a3b51d5351 \
+    --hash=sha256:84342bffd1f82d4f036433e7039e241a243531a1d3acd7341b35ae58cdab05bf \
+    --hash=sha256:ad8a76557880aed5234cfe7279805f4ab5ce16b17954606cca90d578d3e713ef \
+    --hash=sha256:ba51c0c5e029bb5420a343586ff79d56e7455d496d18a30309616fdbeed1068f \
+    --hash=sha256:cb65f619dfbdd15e662423e8d257780f9a66677eae5b4b3fc9dca70b5fd2d2a3 \
+    --hash=sha256:ccd9006d92232727d23f784795191bfd02294a4f2ba68708825cb1da39511a93 \
+    --hash=sha256:d2b8dd64f127d8b324f5d2cd1c0fd6f68af69084e9e47d27efeb9e28e685af3e \
+    --hash=sha256:d3e465aee0ec353949f0f46bf6c6f9790a2006af896cee7c178a8c3e5090aa32 \
+    --hash=sha256:e4d51919110a030913201422fb07987db4338eba5ec8c5a15d6fab8e03d443fc
+    # via
+    #   keras
+    #   tensorflow-cpu
+httplib2==0.22.0 \
+    --hash=sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc \
+    --hash=sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81
+    # via
+    #   google-api-python-client
+    #   google-auth-httplib2
+    #   oauth2client
+huggingface-hub==0.27.0 \
+    --hash=sha256:8f2e834517f1f1ddf1ecc716f91b120d7333011b7485f665a9a412eacb1a2a81 \
+    --hash=sha256:902cce1a1be5739f5589e560198a65a8edcfd3b830b1666f36e4b961f0454fac
+    # via
+    #   datasets
+    #   tokenizers
+    #   transformers
+humanize==4.11.0 \
+    --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \
+    --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be
+    # via orbax-checkpoint
+idna==3.10 \
+    --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
+    --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
+    # via
+    #   requests
+    #   yarl
+immutabledict==4.2.1 \
+    --hash=sha256:c56a26ced38c236f79e74af3ccce53772827cef5c3bce7cab33ff2060f756373 \
+    --hash=sha256:d91017248981c72eb66c8ff9834e99c2f53562346f23e7f51e7a5ebcf66a3bcc
+    # via tensorflow-datasets
+importlib-resources==6.5.2 \
+    --hash=sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c \
+    --hash=sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec
+    # via etils
+jax[cpu]==0.4.34 \
+    --hash=sha256:44196854f40c5f9cea3142824b9f1051f85afc3fcf7593ec5479fc8db01c58db \
+    --hash=sha256:b957ca1fc91f7343f91a186af9f19c7f342c946f95a8c11c7f1e5cdfe2e58d9e
+    # via
+    #   -r requirements-dev.txt
+    #   chex
+    #   flax
+    #   jraph
+    #   optax
+    #   orbax-checkpoint
+jaxlib==0.4.34 \
+    --hash=sha256:096f0ca309d41fa692a9d1f2f9baab1c5c8ca0749876ebb3f748e738a27c7ff4 \
+    --hash=sha256:133070d4fec5525ffea4dc72956398c1cf647a04dcb37f8a935ee82af78d9965 \
+    --hash=sha256:1a30771d85fa77f9ab8f18e63240f455ab3a3f87660ed7b8d5eea6ceecbe5c1e \
+    --hash=sha256:3bcfa639ca3cfaf86c8ceebd5fc0d47300fd98a078014a1d0cc03133e1523d5f \
+    --hash=sha256:3e60bc826933082e99b19b87c21818a8d26fcdb01f418d47cedff554746fd6cc \
+    --hash=sha256:45d719a2ce0ebf21255a277b71d756f3609b7b5be70cddc5d88fd58c35219de0 \
+    --hash=sha256:48272e9034ff868d4328cf0055a07882fd2be93f59dfb6283af7de491f9d1290 \
+    --hash=sha256:571ef03259835458111596a71a2f4a6fabf4ec34595df4cea555035362ac5bf0 \
+    --hash=sha256:6b43a974c5d91a19912d138f2658dd8dbb7d30dcdff5c961d896c673e872b611 \
+    --hash=sha256:72e22e99a5dc890a64443c3fc12f13f20091f578c405a76de077ba42b4c62cd7 \
+    --hash=sha256:7be673a876ebd1aef440fb7e3ebaf99a91abeb550c9728c644b7d7c7b5d7c108 \
+    --hash=sha256:87f25a477cd279840e53718403f97092eba0e8a945fcab47bcf435b6f9119dda \
+    --hash=sha256:8ee3f93836e53c86556ccd9449a4ea43516ee05184d031a71dd692e81259f7d9 \
+    --hash=sha256:901cb4040ed24eae40071d8114ea8d10dff436277fa74a1a5b9e7206f641151c \
+    --hash=sha256:b0001c8f0e2b1c7bc99e4f314b524a340d25653505c1a1484d4041a9d3617f6f \
+    --hash=sha256:b7a212a3cb5c6acc201c32ae4f4b5f5a9ac09457fbb77ba8db5ce7e7d4adc214 \
+    --hash=sha256:c303f5acaf6c56ce5ff133a923c9b6247bdebedde15bd2c893c24be4d8f71306 \
+    --hash=sha256:c7b3e724a30426a856070aba0192b5d199e95b4411070e7ad96ad8b196877b10 \
+    --hash=sha256:c9d3adcae43a33aad4332be9c2aedc5ef751d1e755f917a5afb30c7872eacaa8 \
+    --hash=sha256:d840e64b85f8865404d6d225b9bb340e158df1457152a361b05680e24792b232
+    # via
+    #   chex
+    #   jax
+    #   jraph
+    #   optax
+jinja2==3.1.5 \
+    --hash=sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb \
+    --hash=sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb
+    # via torch
+jmp==0.0.4 \
+    --hash=sha256:5dfeb0fd7c7a9f72a70fff0aab9d0cbfae32a809c02f4037ff3485ceb33e1730 \
+    --hash=sha256:6aa7adbddf2bd574b28c7faf6e81a735eb11f53386447896909c6968dc36807d
+    # via dm-haiku
+joblib==1.4.2 \
+    --hash=sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6 \
+    --hash=sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e
+    # via scikit-learn
+jraph==0.0.6.dev0 \
+    --hash=sha256:350fe37bf717f934f1f84fd3370a480b3178bfcb61dfa217c738971308c57625 \
+    --hash=sha256:c3ac3a0b224b344eb6d367e8bc312d95ea41bf825d01ea31b80dd8c22c0dd8b8
+    # via -r requirements-dev.txt
+keras==3.7.0 \
+    --hash=sha256:546a64f302e4779c129c06d9826fa586de752cdfd43d7dc4010c31b282587969 \
+    --hash=sha256:a4451a5591e75dfb414d0b84a3fd2fb9c0240cc87ebe7e397f547ce10b0e67b7
+    # via
+    #   -r requirements-dev.txt
+    #   tensorflow-cpu
+kiwisolver==1.4.8 \
+    --hash=sha256:01c3d31902c7db5fb6182832713d3b4122ad9317c2c5877d0539227d96bb2e50 \
+    --hash=sha256:034d2c891f76bd3edbdb3ea11140d8510dca675443da7304205a2eaa45d8334c \
+    --hash=sha256:085940635c62697391baafaaeabdf3dd7a6c3643577dde337f4d66eba021b2b8 \
+    --hash=sha256:08e77738ed7538f036cd1170cbed942ef749137b1311fa2bbe2a7fda2f6bf3cc \
+    --hash=sha256:111793b232842991be367ed828076b03d96202c19221b5ebab421ce8bcad016f \
+    --hash=sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79 \
+    --hash=sha256:151dffc4865e5fe6dafce5480fab84f950d14566c480c08a53c663a0020504b6 \
+    --hash=sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2 \
+    --hash=sha256:1732e065704b47c9afca7ffa272f845300a4eb959276bf6970dc07265e73b605 \
+    --hash=sha256:1c8ceb754339793c24aee1c9fb2485b5b1f5bb1c2c214ff13368431e51fc9a09 \
+    --hash=sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab \
+    --hash=sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e \
+    --hash=sha256:257af1622860e51b1a9d0ce387bf5c2c4f36a90594cb9514f55b074bcc787cfc \
+    --hash=sha256:286b18e86682fd2217a48fc6be6b0f20c1d0ed10958d8dc53453ad58d7be0bf8 \
+    --hash=sha256:291331973c64bb9cce50bbe871fb2e675c4331dab4f31abe89f175ad7679a4d7 \
+    --hash=sha256:2f0121b07b356a22fb0414cec4666bbe36fd6d0d759db3d37228f496ed67c880 \
+    --hash=sha256:3452046c37c7692bd52b0e752b87954ef86ee2224e624ef7ce6cb21e8c41cc1b \
+    --hash=sha256:34d142fba9c464bc3bbfeff15c96eab0e7310343d6aefb62a79d51421fcc5f1b \
+    --hash=sha256:369b75d40abedc1da2c1f4de13f3482cb99e3237b38726710f4a793432b1c5ff \
+    --hash=sha256:36dbbfd34838500a31f52c9786990d00150860e46cd5041386f217101350f0d3 \
+    --hash=sha256:370fd2df41660ed4e26b8c9d6bbcad668fbe2560462cba151a721d49e5b6628c \
+    --hash=sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0 \
+    --hash=sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6 \
+    --hash=sha256:3cd3bc628b25f74aedc6d374d5babf0166a92ff1317f46267f12d2ed54bc1d30 \
+    --hash=sha256:3ddc373e0eef45b59197de815b1b28ef89ae3955e7722cc9710fb91cd77b7f47 \
+    --hash=sha256:4191ee8dfd0be1c3666ccbac178c5a05d5f8d689bbe3fc92f3c4abec817f8fe0 \
+    --hash=sha256:54a62808ac74b5e55a04a408cda6156f986cefbcf0ada13572696b507cc92fa1 \
+    --hash=sha256:577facaa411c10421314598b50413aa1ebcf5126f704f1e5d72d7e4e9f020d90 \
+    --hash=sha256:641f2ddf9358c80faa22e22eb4c9f54bd3f0e442e038728f500e3b978d00aa7d \
+    --hash=sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b \
+    --hash=sha256:68269e60ee4929893aad82666821aaacbd455284124817af45c11e50a4b42e3c \
+    --hash=sha256:69b5637c3f316cab1ec1c9a12b8c5f4750a4c4b71af9157645bf32830e39c03a \
+    --hash=sha256:7506488470f41169b86d8c9aeff587293f530a23a23a49d6bc64dab66bedc71e \
+    --hash=sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc \
+    --hash=sha256:77e6f57a20b9bd4e1e2cedda4d0b986ebd0216236f0106e55c28aea3d3d69b16 \
+    --hash=sha256:782bb86f245ec18009890e7cb8d13a5ef54dcf2ebe18ed65f795e635a96a1c6a \
+    --hash=sha256:7a3ad337add5148cf51ce0b55642dc551c0b9d6248458a757f98796ca7348712 \
+    --hash=sha256:7cd2785b9391f2873ad46088ed7599a6a71e762e1ea33e87514b1a441ed1da1c \
+    --hash=sha256:7e9a60b50fe8b2ec6f448fe8d81b07e40141bfced7f896309df271a0b92f80f3 \
+    --hash=sha256:84a2f830d42707de1d191b9490ac186bf7997a9495d4e9072210a1296345f7dc \
+    --hash=sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561 \
+    --hash=sha256:858416b7fb777a53f0c59ca08190ce24e9abbd3cffa18886a5781b8e3e26f65d \
+    --hash=sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc \
+    --hash=sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db \
+    --hash=sha256:893f5525bb92d3d735878ec00f781b2de998333659507d29ea4466208df37bed \
+    --hash=sha256:89c107041f7b27844179ea9c85d6da275aa55ecf28413e87624d033cf1f6b751 \
+    --hash=sha256:918139571133f366e8362fa4a297aeba86c7816b7ecf0bc79168080e2bd79957 \
+    --hash=sha256:99cea8b9dd34ff80c521aef46a1dddb0dcc0283cf18bde6d756f1e6f31772165 \
+    --hash=sha256:a17b7c4f5b2c51bb68ed379defd608a03954a1845dfed7cc0117f1cc8a9b7fd2 \
+    --hash=sha256:a3c44cb68861de93f0c4a8175fbaa691f0aa22550c331fefef02b618a9dcb476 \
+    --hash=sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84 \
+    --hash=sha256:a5ce1e481a74b44dd5e92ff03ea0cb371ae7a0268318e202be06c8f04f4f1246 \
+    --hash=sha256:a66f60f8d0c87ab7f59b6fb80e642ebb29fec354a4dfad687ca4092ae69d04f4 \
+    --hash=sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25 \
+    --hash=sha256:b47a465040146981dc9db8647981b8cb96366fbc8d452b031e4f8fdffec3f26d \
+    --hash=sha256:b5773efa2be9eb9fcf5415ea3ab70fc785d598729fd6057bea38d539ead28271 \
+    --hash=sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb \
+    --hash=sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31 \
+    --hash=sha256:bcb1ebc3547619c3b58a39e2448af089ea2ef44b37988caf432447374941574e \
+    --hash=sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85 \
+    --hash=sha256:c07b29089b7ba090b6f1a669f1411f27221c3662b3a1b7010e67b59bb5a6f10b \
+    --hash=sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7 \
+    --hash=sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03 \
+    --hash=sha256:c72941acb7b67138f35b879bbe85be0f6c6a70cab78fe3ef6db9c024d9223e5b \
+    --hash=sha256:c8bf637892dc6e6aad2bc6d4d69d08764166e5e3f69d469e55427b6ac001b19d \
+    --hash=sha256:cc978a80a0db3a66d25767b03688f1147a69e6237175c0f4ffffaaedf744055a \
+    --hash=sha256:ce2cf1e5688edcb727fdf7cd1bbd0b6416758996826a8be1d958f91880d0809d \
+    --hash=sha256:d47b28d1dfe0793d5e96bce90835e17edf9a499b53969b03c6c47ea5985844c3 \
+    --hash=sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67 \
+    --hash=sha256:d5536185fce131780ebd809f8e623bf4030ce1b161353166c49a3c74c287897f \
+    --hash=sha256:d561d2d8883e0819445cfe58d7ddd673e4015c3c57261d7bdcd3710d0d14005c \
+    --hash=sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502 \
+    --hash=sha256:d6d6bd87df62c27d4185de7c511c6248040afae67028a8a22012b010bc7ad062 \
+    --hash=sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954 \
+    --hash=sha256:e063ef9f89885a1d68dd8b2e18f5ead48653176d10a0e324e3b0030e3a69adeb \
+    --hash=sha256:e7a019419b7b510f0f7c9dceff8c5eae2392037eae483a7f9162625233802b0a \
+    --hash=sha256:eaa973f1e05131de5ff3569bbba7f5fd07ea0595d3870ed4a526d486fe57fa1b \
+    --hash=sha256:eb158fe28ca0c29f2260cca8c43005329ad58452c36f0edf298204de32a9a3ed \
+    --hash=sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34 \
+    --hash=sha256:fc2ace710ba7c1dfd1a3b42530b62b9ceed115f19a1656adefce7b1782a37794
+    # via matplotlib
+libclang==18.1.1 \
+    --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \
+    --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \
+    --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \
+    --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \
+    --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \
+    --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \
+    --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \
+    --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \
+    --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \
+    --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe
+    # via tensorflow-cpu
+markdown==3.7 \
+    --hash=sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2 \
+    --hash=sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803
+    # via tensorboard
+markdown-it-py==3.0.0 \
+    --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \
+    --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb
+    # via rich
+markupsafe==3.0.2 \
+    --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \
+    --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \
+    --hash=sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0 \
+    --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \
+    --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \
+    --hash=sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13 \
+    --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \
+    --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \
+    --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \
+    --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \
+    --hash=sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0 \
+    --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \
+    --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \
+    --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \
+    --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \
+    --hash=sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff \
+    --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \
+    --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \
+    --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \
+    --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \
+    --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \
+    --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \
+    --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \
+    --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \
+    --hash=sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a \
+    --hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \
+    --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \
+    --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \
+    --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \
+    --hash=sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144 \
+    --hash=sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f \
+    --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \
+    --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \
+    --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \
+    --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \
+    --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \
+    --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \
+    --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \
+    --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \
+    --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \
+    --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \
+    --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \
+    --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \
+    --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \
+    --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \
+    --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \
+    --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \
+    --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \
+    --hash=sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29 \
+    --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \
+    --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \
+    --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \
+    --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \
+    --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \
+    --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \
+    --hash=sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a \
+    --hash=sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178 \
+    --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \
+    --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \
+    --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \
+    --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50
+    # via
+    #   jinja2
+    #   werkzeug
+matplotlib==3.10.0 \
+    --hash=sha256:01d2b19f13aeec2e759414d3bfe19ddfb16b13a1250add08d46d5ff6f9be83c6 \
+    --hash=sha256:12eaf48463b472c3c0f8dbacdbf906e573013df81a0ab82f0616ea4b11281908 \
+    --hash=sha256:2c5829a5a1dd5a71f0e31e6e8bb449bc0ee9dbfb05ad28fc0c6b55101b3a4be6 \
+    --hash=sha256:2fbbabc82fde51391c4da5006f965e36d86d95f6ee83fb594b279564a4c5d0d2 \
+    --hash=sha256:3547d153d70233a8496859097ef0312212e2689cdf8d7ed764441c77604095ae \
+    --hash=sha256:359f87baedb1f836ce307f0e850d12bb5f1936f70d035561f90d41d305fdacea \
+    --hash=sha256:3b427392354d10975c1d0f4ee18aa5844640b512d5311ef32efd4dd7db106ede \
+    --hash=sha256:4659665bc7c9b58f8c00317c3c2a299f7f258eeae5a5d56b4c64226fca2f7c59 \
+    --hash=sha256:4673ff67a36152c48ddeaf1135e74ce0d4bce1bbf836ae40ed39c29edf7e2765 \
+    --hash=sha256:503feb23bd8c8acc75541548a1d709c059b7184cde26314896e10a9f14df5f12 \
+    --hash=sha256:5439f4c5a3e2e8eab18e2f8c3ef929772fd5641876db71f08127eed95ab64683 \
+    --hash=sha256:5cdbaf909887373c3e094b0318d7ff230b2ad9dcb64da7ade654182872ab2593 \
+    --hash=sha256:5e6c6461e1fc63df30bf6f80f0b93f5b6784299f721bc28530477acd51bfc3d1 \
+    --hash=sha256:5fd41b0ec7ee45cd960a8e71aea7c946a28a0b8a4dcee47d2856b2af051f334c \
+    --hash=sha256:607b16c8a73943df110f99ee2e940b8a1cbf9714b65307c040d422558397dac5 \
+    --hash=sha256:7e8632baebb058555ac0cde75db885c61f1212e47723d63921879806b40bec6a \
+    --hash=sha256:81713dd0d103b379de4516b861d964b1d789a144103277769238c732229d7f03 \
+    --hash=sha256:845d96568ec873be63f25fa80e9e7fae4be854a66a7e2f0c8ccc99e94a8bd4ef \
+    --hash=sha256:95b710fea129c76d30be72c3b38f330269363fbc6e570a5dd43580487380b5ff \
+    --hash=sha256:96f2886f5c1e466f21cc41b70c5a0cd47bfa0015eb2d5793c88ebce658600e25 \
+    --hash=sha256:994c07b9d9fe8d25951e3202a68c17900679274dadfc1248738dcfa1bd40d7f3 \
+    --hash=sha256:9ade1003376731a971e398cc4ef38bb83ee8caf0aee46ac6daa4b0506db1fd06 \
+    --hash=sha256:9b0558bae37f154fffda54d779a592bc97ca8b4701f1c710055b609a3bac44c8 \
+    --hash=sha256:a2a43cbefe22d653ab34bb55d42384ed30f611bcbdea1f8d7f431011a2e1c62e \
+    --hash=sha256:a994f29e968ca002b50982b27168addfd65f0105610b6be7fa515ca4b5307c95 \
+    --hash=sha256:ad2e15300530c1a94c63cfa546e3b7864bd18ea2901317bae8bbf06a5ade6dcf \
+    --hash=sha256:ae80dc3a4add4665cf2faa90138384a7ffe2a4e37c58d83e115b54287c4f06ef \
+    --hash=sha256:b886d02a581b96704c9d1ffe55709e49b4d2d52709ccebc4be42db856e511278 \
+    --hash=sha256:c40ba2eb08b3f5de88152c2333c58cee7edcead0a2a0d60fcafa116b17117adc \
+    --hash=sha256:c55b20591ced744aa04e8c3e4b7543ea4d650b6c3c4b208c08a05b4010e8b442 \
+    --hash=sha256:c58a9622d5dbeb668f407f35f4e6bfac34bb9ecdcc81680c04d0258169747997 \
+    --hash=sha256:d44cb942af1693cced2604c33a9abcef6205601c445f6d0dc531d813af8a2f5a \
+    --hash=sha256:d907fddb39f923d011875452ff1eca29a9e7f21722b873e90db32e5d8ddff12e \
+    --hash=sha256:fd44fc75522f58612ec4a33958a7e5552562b7705b42ef1b4f8c0818e304a363
+    # via plotnine
+mdurl==0.1.2 \
+    --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \
+    --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba
+    # via markdown-it-py
+mizani==0.13.1 \
+    --hash=sha256:7da0dcacd43fbcc01c279ea06a76f1f064ae90dbb387c4a985ba24a92d3c7d7a \
+    --hash=sha256:e3247ea12c746c8104767d7e42a2d16473173c7bc314f298d8294a58f4653353
+    # via plotnine
+ml-dtypes==0.4.1 \
+    --hash=sha256:126e7d679b8676d1a958f2651949fbfa182832c3cd08020d8facd94e4114f3e9 \
+    --hash=sha256:15fdd922fea57e493844e5abb930b9c0bd0af217d9edd3724479fc3d7ce70e3f \
+    --hash=sha256:1fe8b5b5e70cd67211db94b05cfd58dace592f24489b038dc6f9fe347d2e07d5 \
+    --hash=sha256:274cc7193dd73b35fb26bef6c5d40ae3eb258359ee71cd82f6e96a8c948bdaa6 \
+    --hash=sha256:2d55b588116a7085d6e074cf0cdb1d6fa3875c059dddc4d2c94a4cc81c23e975 \
+    --hash=sha256:560be16dc1e3bdf7c087eb727e2cf9c0e6a3d87e9f415079d2491cc419b3ebf5 \
+    --hash=sha256:74c6cfb5cf78535b103fde9ea3ded8e9f16f75bc07789054edc7776abfb3d752 \
+    --hash=sha256:772426b08a6172a891274d581ce58ea2789cc8abc1c002a27223f314aaf894e7 \
+    --hash=sha256:827d3ca2097085cf0355f8fdf092b888890bb1b1455f52801a2d7756f056f54b \
+    --hash=sha256:8c09a6d11d8475c2a9fd2bc0695628aec105f97cab3b3a3fb7c9660348ff7d24 \
+    --hash=sha256:9f5e8f75fa371020dd30f9196e7d73babae2abd51cf59bdd56cb4f8de7e13354 \
+    --hash=sha256:ad0b757d445a20df39035c4cdeed457ec8b60d236020d2560dbc25887533cf50 \
+    --hash=sha256:df0fb650d5c582a9e72bb5bd96cfebb2cdb889d89daff621c8fbc60295eba66c \
+    --hash=sha256:e138a9b7a48079c900ea969341a5754019a1ad17ae27ee330f7ebf43f23877f9 \
+    --hash=sha256:e35e486e97aee577d0890bc3bd9e9f9eece50c08c163304008587ec8cfe7575b \
+    --hash=sha256:ef0d7e3fece227b49b544fa69e50e607ac20948f0043e9f76b44f35f229ea450 \
+    --hash=sha256:fad5f2de464fd09127e49b7fd1252b9006fb43d2edc1ff112d390c324af5ca7a
+    # via
+    #   jax
+    #   jaxlib
+    #   keras
+    #   tensorflow-cpu
+    #   tensorstore
+mpmath==1.3.0 \
+    --hash=sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f \
+    --hash=sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c
+    # via sympy
+msgpack==1.1.0 \
+    --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \
+    --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \
+    --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \
+    --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \
+    --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \
+    --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \
+    --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \
+    --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \
+    --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \
+    --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \
+    --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \
+    --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \
+    --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \
+    --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \
+    --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \
+    --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \
+    --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \
+    --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \
+    --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \
+    --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \
+    --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \
+    --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \
+    --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \
+    --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \
+    --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \
+    --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \
+    --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \
+    --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \
+    --hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \
+    --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \
+    --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \
+    --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \
+    --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \
+    --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \
+    --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \
+    --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \
+    --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \
+    --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \
+    --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \
+    --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \
+    --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \
+    --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \
+    --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \
+    --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \
+    --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \
+    --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \
+    --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \
+    --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \
+    --hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \
+    --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \
+    --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \
+    --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \
+    --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \
+    --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \
+    --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \
+    --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \
+    --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \
+    --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \
+    --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \
+    --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \
+    --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \
+    --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \
+    --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \
+    --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788
+    # via
+    #   flax
+    #   orbax-checkpoint
+multidict==6.1.0 \
+    --hash=sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f \
+    --hash=sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056 \
+    --hash=sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761 \
+    --hash=sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3 \
+    --hash=sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b \
+    --hash=sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6 \
+    --hash=sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748 \
+    --hash=sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966 \
+    --hash=sha256:16e5f4bf4e603eb1fdd5d8180f1a25f30056f22e55ce51fb3d6ad4ab29f7d96f \
+    --hash=sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1 \
+    --hash=sha256:189f652a87e876098bbc67b4da1049afb5f5dfbaa310dd67c594b01c10388db6 \
+    --hash=sha256:1ca0083e80e791cffc6efce7660ad24af66c8d4079d2a750b29001b53ff59ada \
+    --hash=sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305 \
+    --hash=sha256:2090f6a85cafc5b2db085124d752757c9d251548cedabe9bd31afe6363e0aff2 \
+    --hash=sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d \
+    --hash=sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a \
+    --hash=sha256:22f3105d4fb15c8f57ff3959a58fcab6ce36814486500cd7485651230ad4d4ef \
+    --hash=sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c \
+    --hash=sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb \
+    --hash=sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60 \
+    --hash=sha256:3702ea6872c5a2a4eeefa6ffd36b042e9773f05b1f37ae3ef7264b1163c2dcf6 \
+    --hash=sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4 \
+    --hash=sha256:3914f5aaa0f36d5d60e8ece6a308ee1c9784cd75ec8151062614657a114c4478 \
+    --hash=sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81 \
+    --hash=sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7 \
+    --hash=sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56 \
+    --hash=sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3 \
+    --hash=sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6 \
+    --hash=sha256:483a6aea59cb89904e1ceabd2b47368b5600fb7de78a6e4a2c2987b2d256cf30 \
+    --hash=sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb \
+    --hash=sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506 \
+    --hash=sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0 \
+    --hash=sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925 \
+    --hash=sha256:4e18b656c5e844539d506a0a06432274d7bd52a7487e6828c63a63d69185626c \
+    --hash=sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6 \
+    --hash=sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e \
+    --hash=sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95 \
+    --hash=sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2 \
+    --hash=sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133 \
+    --hash=sha256:5845c1fd4866bb5dd3125d89b90e57ed3138241540897de748cdf19de8a2fca2 \
+    --hash=sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa \
+    --hash=sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3 \
+    --hash=sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3 \
+    --hash=sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436 \
+    --hash=sha256:682b987361e5fd7a139ed565e30d81fd81e9629acc7d925a205366877d8c8657 \
+    --hash=sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581 \
+    --hash=sha256:6bb5992037f7a9eff7991ebe4273ea7f51f1c1c511e6a2ce511d0e7bdb754492 \
+    --hash=sha256:73eae06aa53af2ea5270cc066dcaf02cc60d2994bbb2c4ef5764949257d10f43 \
+    --hash=sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2 \
+    --hash=sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2 \
+    --hash=sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926 \
+    --hash=sha256:87701f25a2352e5bf7454caa64757642734da9f6b11384c1f9d1a8e699758057 \
+    --hash=sha256:9079dfc6a70abe341f521f78405b8949f96db48da98aeb43f9907f342f627cdc \
+    --hash=sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80 \
+    --hash=sha256:957cf8e4b6e123a9eea554fa7ebc85674674b713551de587eb318a2df3e00255 \
+    --hash=sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1 \
+    --hash=sha256:9f636b730f7e8cb19feb87094949ba54ee5357440b9658b2a32a5ce4bce53972 \
+    --hash=sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53 \
+    --hash=sha256:a185f876e69897a6f3325c3f19f26a297fa058c5e456bfcff8015e9a27e83ae1 \
+    --hash=sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423 \
+    --hash=sha256:aa466da5b15ccea564bdab9c89175c762bc12825f4659c11227f515cee76fa4a \
+    --hash=sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160 \
+    --hash=sha256:ab7c4ceb38d91570a650dba194e1ca87c2b543488fe9309b4212694174fd539c \
+    --hash=sha256:ac10f4c2b9e770c4e393876e35a7046879d195cd123b4f116d299d442b335bcd \
+    --hash=sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa \
+    --hash=sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5 \
+    --hash=sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b \
+    --hash=sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa \
+    --hash=sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef \
+    --hash=sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44 \
+    --hash=sha256:c08be4f460903e5a9d0f76818db3250f12e9c344e79314d1d570fc69d7f4eae4 \
+    --hash=sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156 \
+    --hash=sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753 \
+    --hash=sha256:ce2186a7df133a9c895dea3331ddc5ddad42cdd0d1ea2f0a51e5d161e4762f28 \
+    --hash=sha256:d093be959277cb7dee84b801eb1af388b6ad3ca6a6b6bf1ed7585895789d027d \
+    --hash=sha256:d094ddec350a2fb899fec68d8353c78233debde9b7d8b4beeafa70825f1c281a \
+    --hash=sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304 \
+    --hash=sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008 \
+    --hash=sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429 \
+    --hash=sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72 \
+    --hash=sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399 \
+    --hash=sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3 \
+    --hash=sha256:db7457bac39421addd0c8449933ac32d8042aae84a14911a757ae6ca3eef1392 \
+    --hash=sha256:e27bbb6d14416713a8bd7aaa1313c0fc8d44ee48d74497a0ff4c3a1b6ccb5167 \
+    --hash=sha256:e617fb6b0b6953fffd762669610c1c4ffd05632c138d61ac7e14ad187870669c \
+    --hash=sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774 \
+    --hash=sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351 \
+    --hash=sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76 \
+    --hash=sha256:f4c035da3f544b1882bac24115f3e2e8760f10a0107614fc9839fd232200b875 \
+    --hash=sha256:f67f217af4b1ff66c68a87318012de788dd95fcfeb24cc889011f4e1c7454dfd \
+    --hash=sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28 \
+    --hash=sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.17 \
+    --hash=sha256:1d52f068357acd1e5bbc670b273ef8f81d57863235d9fbf9314751886e141968 \
+    --hash=sha256:20c28ca19079a6c879258103a6d60b94d4ffe2d9da07dda93fb1c8bc6243f522 \
+    --hash=sha256:27b8409c02b5dd89d336107c101dfbd1530a2cd4fd425fc27dcb7adb6e0b47bf \
+    --hash=sha256:2818af14c52446b9617d1b0755fa70ca2f77c28b25ed97bdaa2c69a22c47b46c \
+    --hash=sha256:2884701445d0177aec5bd5f6ee0df296773e4fb65b11903b94c613fb46cfb7d1 \
+    --hash=sha256:2b12e081df87ab755190e227341b2c3b17ee6587e9c82fecddcbe6aa812cd7f7 \
+    --hash=sha256:2ea0939b0f4760a16a548942c65c76ff5afd81fbf1083c56ae75e21faf92e426 \
+    --hash=sha256:349525099a0c9ac5936f0488b5ee73199098dac3ac899d81d326d238f9fd3ccd \
+    --hash=sha256:38357ca266b51a2e22841b755d9a91e4bb7b937979a54d411677111716c32744 \
+    --hash=sha256:4ae2f11a3416809ebc9a48abfc8b14ecce0652a0944731a1493a3c1ba44ff57a \
+    --hash=sha256:7ddb24e5bcdb64e90ec5543a1f05a39463068b6d3b804aa3f2a4e16ec28562d6 \
+    --hash=sha256:a0f01cd9d079af7a8296f521dc03859d1a414d14c1e2b6e676ef789333421c95 \
+    --hash=sha256:a22a6b1a482b80eab53078418bb0f7025e4f7d93cc8e1f36481477a023884861 \
+    --hash=sha256:c2c82d0375baed8d8dd0d8c38eb87c5ae9c471f8e384ad203a36f095ee860f67 \
+    --hash=sha256:c3feb874ba574fbccfb335980020c1ac631fbf2a3f7bee4e2042ede62558a021 \
+    --hash=sha256:d729f55198a3579f6879766a6d9b72b42d4b320c0dcb7844afb774d75b573c62
+    # via
+    #   -r requirements-dev.txt
+    #   datasets
+namex==0.0.8 \
+    --hash=sha256:32a50f6c565c0bb10aa76298c959507abdc0e850efe085dc38f3440fcb3aa90b \
+    --hash=sha256:7ddb6c2bb0e753a311b7590f84f6da659dd0c05e65cb89d519d54c0a250c0487
+    # via keras
+nest-asyncio==1.6.0 \
+    --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \
+    --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c
+    # via orbax-checkpoint
+networkx==3.4.2 \
+    --hash=sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1 \
+    --hash=sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f
+    # via torch
+numpy==1.26.4 \
+    --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \
+    --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \
+    --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \
+    --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \
+    --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \
+    --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \
+    --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \
+    --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \
+    --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \
+    --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \
+    --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \
+    --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \
+    --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \
+    --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \
+    --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \
+    --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \
+    --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \
+    --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \
+    --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \
+    --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \
+    --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \
+    --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \
+    --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \
+    --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \
+    --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \
+    --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \
+    --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \
+    --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \
+    --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \
+    --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \
+    --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \
+    --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \
+    --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \
+    --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \
+    --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \
+    --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f
+    # via
+    #   -r requirements-dev.txt
+    #   chex
+    #   contourpy
+    #   datasets
+    #   dm-haiku
+    #   etils
+    #   flax
+    #   h5py
+    #   jax
+    #   jaxlib
+    #   jmp
+    #   jraph
+    #   keras
+    #   matplotlib
+    #   mizani
+    #   ml-dtypes
+    #   optax
+    #   orbax-checkpoint
+    #   pandas
+    #   patsy
+    #   plotnine
+    #   scikit-learn
+    #   scipy
+    #   statsmodels
+    #   tensorboard
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+    #   tensorstore
+    #   torchvision
+    #   transformers
+nvidia-cublas-cu12==12.1.3.1 \
+    --hash=sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906 \
+    --hash=sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105 \
+    --hash=sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4 \
+    --hash=sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105 \
+    --hash=sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed \
+    --hash=sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105 \
+    --hash=sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40 \
+    --hash=sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26 \
+    --hash=sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9
+    # via torch
+nvidia-cufft-cu12==11.0.2.54 \
+    --hash=sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56 \
+    --hash=sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253
+    # via torch
+nvidia-curand-cu12==10.3.2.106 \
+    --hash=sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a \
+    --hash=sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107 \
+    --hash=sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5 \
+    --hash=sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106 \
+    --hash=sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a \
+    --hash=sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.20.5 \
+    --hash=sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56 \
+    --hash=sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01
+    # via torch
+nvidia-nvjitlink-cu12==12.6.85 \
+    --hash=sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41 \
+    --hash=sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c \
+    --hash=sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105 \
+    --hash=sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82 \
+    --hash=sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5
+    # via torch
+oauth2client==4.1.3 \
+    --hash=sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac \
+    --hash=sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6
+    # via cloud-tpu-client
+opt-einsum==3.4.0 \
+    --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \
+    --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac
+    # via
+    #   jax
+    #   tensorflow-cpu
+optax==0.2.4 \
+    --hash=sha256:4e05d3d5307e6dde4c319187ae36e6cd3a0c035d4ed25e9e992449a304f47336 \
+    --hash=sha256:db35c04e50b52596662efb002334de08c2a0a74971e4da33f467e84fac08886a
+    # via
+    #   -r requirements-dev.txt
+    #   flax
+optree==0.13.1 \
+    --hash=sha256:01819c3df950696f32c91faf8d376ae6b695ffdba18f330f1cab6b8e314e4612 \
+    --hash=sha256:025d23400b8b579462a251420f0a9ae77d3d3593f84276f3465985731d79d722 \
+    --hash=sha256:04252b5f24e5dae716647848b302f5f7849ecb028f8c617666d1b89a42eb988b \
+    --hash=sha256:0914ba436d6c0781dc9b04e3b95e06fe5c4fc6a87e94893da971805a3790efe8 \
+    --hash=sha256:0adc896018f34b5f37f6c92c35ae639877578725c5281cc9d4a0ac2ab2c46f77 \
+    --hash=sha256:0aec6da79a6130b4c76073241c0f31c11b96a38e70c7a00f9ed918d7464394ab \
+    --hash=sha256:0f1bde49e41a158af28d99fae1bd425fbd664907c53cf595106fb5b35e5cbe26 \
+    --hash=sha256:0f9707547635cfede8d79e4161c066021ffefc401d98bbf8eba452b1355a42c7 \
+    --hash=sha256:100d70cc57af5284649f881e6b266fee3a3e86e82024484eaa64ee18d1587e42 \
+    --hash=sha256:111172446e8a4f0d3be13a853fa28cb46b5679a1c7ca15b2e6db2b43dbbf9efb \
+    --hash=sha256:135e29e0a69149958003443d43f49af0ebb65f03ae52cddf4142e94d5a36b0c8 \
+    --hash=sha256:1496f29d5b9633fed4b3f1fd4b7e772d77200eb2370c08ef8e14404309c669b9 \
+    --hash=sha256:1891267f9dc76e9ddfed947ff7b755ad438ad483de0537a6b5bcf38478d5a33c \
+    --hash=sha256:1935639dd498a42367633e3877797e1330e39d44d48bbca1a136bb4dbe4c1bc9 \
+    --hash=sha256:1b291aed475ca5992a0c587ca4b72f074724209e01afca9d015c9a5b2089c68d \
+    --hash=sha256:1d74ff3dfe8599935d52b26a2fe5a43242b4d3f47be6fc1c5ce34c25e116d616 \
+    --hash=sha256:2063234ef4d58f11277e157d1cf066a8bd07be911da226bff84fc9761b8c1a25 \
+    --hash=sha256:22ce30c9d733c2214fa321c8370e4dfc8c7829970364618b2b5cacffbc9e8949 \
+    --hash=sha256:2521840d6aded4dac62c787f50bcb1cacbfcda86b9319d666b4025fa0ba5545a \
+    --hash=sha256:27d81dc43b522ba47ba7d2e7d91dbb486940348b1bf85caeb0afc2815c0aa492 \
+    --hash=sha256:28f083ede9be89503357a6b9e5d304826701596abe13d33e8f6fa2cd85b407fc \
+    --hash=sha256:2909cb42add6bb1a5a2b0243bdd8c4b861bf072f3741e26239481907ac8ad4e6 \
+    --hash=sha256:2cba7ca4cf991270a9fdd080b091d2cbdbcbf27858acebda6af40ff57312d1ea \
+    --hash=sha256:3010ae24e994f6e00071098d34e98e78eb995b7454a2ef629a0bf7df17441b24 \
+    --hash=sha256:30b02951c48ecca6fbeb6a3cc7a858267c4d82d1c874481a639061e845168da5 \
+    --hash=sha256:34b4dd0f5d73170c7740726cadfca973220ccbed9559beb51fab446d9e584d0a \
+    --hash=sha256:360f2e8f7eb22ff131bc7e3e241035908e6b47d41372eb3d68d77bc7036ddb30 \
+    --hash=sha256:363939b255a9fa0e077d8297a8301857c859592fc581cee19ec9238e0c145c4a \
+    --hash=sha256:37948e2d796db23d6ccd07105b709b827eba26549d34dd2149e95887c89fe9b4 \
+    --hash=sha256:395ac2eb69528613fd0f2ee8706890b7921b8ff3159df53b6e9f67eaf519c5cb \
+    --hash=sha256:3d0161012d80e4865017e10298ac55652cc3ad9a3eae9440229d4bf00b140e01 \
+    --hash=sha256:3da76fc43dcc22fe58d11634a04672ca7cc270aed469ac35fd5c78b7b9bc9125 \
+    --hash=sha256:4711f5cac5a2a49c3d6c9f0eca7b77c22b452170bb33ea01c3214ebb17931db9 \
+    --hash=sha256:48c29d9c6c64c8dc48c8ee97f7c1d5cdb83e37320f0be0857c06ce4b97994aea \
+    --hash=sha256:50dd6a9c8ccef267ab4941f07eac53faf6a00666dce4d209da20525570ffaca3 \
+    --hash=sha256:536ecf0e555432cc939d958590e33e00e75cc254ab0dd269e84fc9de8352db61 \
+    --hash=sha256:5569b95e214d20a1b7acb7d9477fabbd709d334bc34f3257368ea1418b811a44 \
+    --hash=sha256:55e82426bef151149cfa41d68ac957730fcd420996c0db8324fca81aa6a810ba \
+    --hash=sha256:587fb8de8e75e80fe7c7240e269630876bec3ee2038724893370976207813e4b \
+    --hash=sha256:5b5626c38d4a18a144063db5c1dbb558431d83ca10682324f74665a12214801f \
+    --hash=sha256:5b6531cd4eb23fadbbf77faf834e1119da06d7af3154f55786b59953cd87bb8a \
+    --hash=sha256:5c6aed6c5eabda59a91376aca08ba508a06f1c68850216a98743b5f8f55af841 \
+    --hash=sha256:5c950c85561c47efb3b1a3771ed1b2b2339bd5e28a0ca42bdcedadccc645eeac \
+    --hash=sha256:5d21a8b449e47fdbf118ac1938cf6f97d8a60258bc45c6eba3e61f79feeb1ea8 \
+    --hash=sha256:5da0fd26325a07354915cc4e3a9aee797cb75dff07c60d24b3f309457069abd3 \
+    --hash=sha256:5dec0785bc4bbcabecd7e82be3f189b21f3ce8a1244b243009736912a6d8f737 \
+    --hash=sha256:5f94a627c5a2fb776bbfa8f7558db5b918916d37586ba943e74e5f22789c4301 \
+    --hash=sha256:63b2749504fe0b9ac3892e26bf55a040ae2973bcf8da1476afe9266a4624be9d \
+    --hash=sha256:64032b77420410c3d315a4b9bcbece15853432c155613bb4261d87809b3ee357 \
+    --hash=sha256:652287e43fcbb29b8d1821144987e3bc558be4e5eec0d42fce7007cc3ee8e574 \
+    --hash=sha256:6bc9aae5ee17a38e3657c8c5db1a60923cc10debd177f6781f352362a846feeb \
+    --hash=sha256:6c4ab1d391b89cb88eb3c63383d5eb0930bc21141de9d5acd277feed9e38eb65 \
+    --hash=sha256:7abf1c6fe42cb112f0fb169f80d7b26476fa44226d2caf3727b49d210bdc3343 \
+    --hash=sha256:7e1c1da6574d59073b6a6b9a13633217f584ec271ddee4e014c7e422f171e9b4 \
+    --hash=sha256:84a6a974aa9dc4119fe502865c8e1755090ac17dbb53a964619a8ece1130831e \
+    --hash=sha256:8d89891e11a55ad83ab3e2810f8571774b2117a6198b4044fa44e0f37f72855e \
+    --hash=sha256:940c739c9957404a9bbe40ed9289792adaf476cece59eca4fe2f32137fa15a8d \
+    --hash=sha256:95298846c057cce2e7d114c03c645e86a5381b72388c8c390986bdefe69a759c \
+    --hash=sha256:9824a4258b058282eeaee1b388c8dfc704e49beda957b99177db8bd8249a3abe \
+    --hash=sha256:9c8ee1e988c634a451146b87d9ebdbf650a75dc1f52a9cffcd89fabb7289321c \
+    --hash=sha256:a3058e2d6a6a7d6362d40f7826258204d9fc2cc4cc8f72eaa3dbff14b6622025 \
+    --hash=sha256:a408a43f16840475612c7058eb80b53791bf8b8266c5b3cd07f69697958fd97d \
+    --hash=sha256:aee696272eece657c2b9e3cf079d8fc7cbbcc8a5c8199dbcd0960ddf7e672fe9 \
+    --hash=sha256:af67856aa8073d237fe67313d84f8aeafac32c1cef7239c628a2768d02679c43 \
+    --hash=sha256:b21ac55473476007e317500fd5851d0a0d695a0c51742bd65fe7347d18530da2 \
+    --hash=sha256:b5e5f09c85ae558a6bdaea57e63168082e728e777391393e9e2792f0d15b7b59 \
+    --hash=sha256:b94f9081cd810a59faae4dbac8f0447e59ce0fb2d70cfb388dc123c33a9fd1a8 \
+    --hash=sha256:bbc5fa2ff5090389f3a906567446f01d692bd6fe5cfcc5ae2d5861f24e8e0e4d \
+    --hash=sha256:bc9c396f64f9aacdf852713bd75f1b9a83f118660fd82e87c937c081b7ddccd1 \
+    --hash=sha256:c4d13f55dbd509d27be3af54d53b4ca0751bc518244ced6d0567e518e51452a2 \
+    --hash=sha256:c84ecb6977ba7f5d4ba24d0312cbffb74c6860237572701c2716bd811ca9b226 \
+    --hash=sha256:c99891c2ea6050738f7e3de5ab4038736cf33555a752b34a06922ebc9bf0488e \
+    --hash=sha256:ce962f0dd387137817dcda600bd6cf2e1b65103411807b6cdbbd9ffddf1061f6 \
+    --hash=sha256:cf85ba1a7d80b6dc19ef5ca4c17d2ff0290dc9306c5b8b468d51cede287f3c8d \
+    --hash=sha256:cfdf7f5cfb5f9b1c0188c667a3dc56551e60a52a918cb8600f84e2f0ad882106 \
+    --hash=sha256:d0c5a389c108367007151bcfef494f8c2674e4aa23d80ac9163876f5b213dfb6 \
+    --hash=sha256:d1844b966bb5c95b64af5c6f92f99e4037452b92b18d060fbd80097b5b773d86 \
+    --hash=sha256:d580f1bf23bb352c4db6b3544f282f1ac08dcb0d9ab537d25e56220353438cf7 \
+    --hash=sha256:d866f707b9f3a9f0e670a73fe8feee4993b2dbdbf9eef598e1cf2e5cb2876413 \
+    --hash=sha256:de1ae16ea0410497e50fe2b4d48a83c37bfc87da76e1e82f9cc8c800b4fc8be6 \
+    --hash=sha256:e40f018f522fcfd244688d1b3a360518e636ba7f636385aae0566eae3e7d29bc \
+    --hash=sha256:efbffeec15e4a79ed9921dc2227cbba1b64db353c4b72ce4ce83e62fbce9e652 \
+    --hash=sha256:f2a9eadcab78ccc04114a6916e9decdbc886bbe04c1b7a7bb32e723209162998 \
+    --hash=sha256:f39c7174a3f3cdc3f5fe6fb4b832f608c40ac174d7567ed6734b2ee952094631 \
+    --hash=sha256:f74fb880472572d550d85d2f1563365b6f194e2157a7703790cbd54d9ab5cf29 \
+    --hash=sha256:f788b2ad120deb73b4908a74473cd6de79cfb9f33bbe9dcb59cea2e2477d4e28 \
+    --hash=sha256:f8e2a546cecc5077ec7d4fe24ec8aede43ca8555b832d115f1ebbb4f3b35bc78 \
+    --hash=sha256:fafeda2e35e3270532132e27b471ea3e3aeac18f7966a4d0469137d1f36046ec
+    # via keras
+orbax-checkpoint==0.11.0 \
+    --hash=sha256:892a124fce71f3e7c71451a2b2090c0251db1097803a119a00baa377113bc9ba \
+    --hash=sha256:d4a0dcc81edd29191cf5a4feb9cf2a4edd31fc5da79d7be616a04f11f2a4d484
+    # via flax
+packaging==24.2 \
+    --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \
+    --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   keras
+    #   matplotlib
+    #   statsmodels
+    #   tensorboard
+    #   tensorflow-cpu
+    #   transformers
+pandas==2.2.3 \
+    --hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \
+    --hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \
+    --hash=sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5 \
+    --hash=sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4 \
+    --hash=sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0 \
+    --hash=sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32 \
+    --hash=sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea \
+    --hash=sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28 \
+    --hash=sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f \
+    --hash=sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348 \
+    --hash=sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18 \
+    --hash=sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468 \
+    --hash=sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5 \
+    --hash=sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e \
+    --hash=sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667 \
+    --hash=sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645 \
+    --hash=sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13 \
+    --hash=sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30 \
+    --hash=sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3 \
+    --hash=sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d \
+    --hash=sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb \
+    --hash=sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3 \
+    --hash=sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039 \
+    --hash=sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8 \
+    --hash=sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd \
+    --hash=sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761 \
+    --hash=sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659 \
+    --hash=sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57 \
+    --hash=sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c \
+    --hash=sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c \
+    --hash=sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4 \
+    --hash=sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a \
+    --hash=sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9 \
+    --hash=sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42 \
+    --hash=sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2 \
+    --hash=sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39 \
+    --hash=sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc \
+    --hash=sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698 \
+    --hash=sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed \
+    --hash=sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015 \
+    --hash=sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24 \
+    --hash=sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319
+    # via
+    #   -r requirements-dev.txt
+    #   datasets
+    #   mizani
+    #   plotnine
+    #   statsmodels
+patsy==1.0.1 \
+    --hash=sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c \
+    --hash=sha256:e786a9391eec818c054e359b737bbce692f051aee4c661f4141cc88fb459c0c4
+    # via statsmodels
+pillow==11.1.0 \
+    --hash=sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83 \
+    --hash=sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96 \
+    --hash=sha256:11633d58b6ee5733bde153a8dafd25e505ea3d32e261accd388827ee987baf65 \
+    --hash=sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a \
+    --hash=sha256:31eba6bbdd27dde97b0174ddf0297d7a9c3a507a8a1480e1e60ef914fe23d352 \
+    --hash=sha256:3362c6ca227e65c54bf71a5f88b3d4565ff1bcbc63ae72c34b07bbb1cc59a43f \
+    --hash=sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20 \
+    --hash=sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c \
+    --hash=sha256:3764d53e09cdedd91bee65c2527815d315c6b90d7b8b79759cc48d7bf5d4f114 \
+    --hash=sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49 \
+    --hash=sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91 \
+    --hash=sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0 \
+    --hash=sha256:4db853948ce4e718f2fc775b75c37ba2efb6aaea41a1a5fc57f0af59eee774b2 \
+    --hash=sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5 \
+    --hash=sha256:54251ef02a2309b5eec99d151ebf5c9904b77976c8abdcbce7891ed22df53884 \
+    --hash=sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e \
+    --hash=sha256:593c5fd6be85da83656b93ffcccc2312d2d149d251e98588b14fbc288fd8909c \
+    --hash=sha256:5bb94705aea800051a743aa4874bb1397d4695fb0583ba5e425ee0328757f196 \
+    --hash=sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756 \
+    --hash=sha256:70ca5ef3b3b1c4a0812b5c63c57c23b63e53bc38e758b37a951e5bc466449861 \
+    --hash=sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269 \
+    --hash=sha256:758e9d4ef15d3560214cddbc97b8ef3ef86ce04d62ddac17ad39ba87e89bd3b1 \
+    --hash=sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb \
+    --hash=sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a \
+    --hash=sha256:8000376f139d4d38d6851eb149b321a52bb8893a88dae8ee7d95840431977081 \
+    --hash=sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1 \
+    --hash=sha256:89dbdb3e6e9594d512780a5a1c42801879628b38e3efc7038094430844e271d8 \
+    --hash=sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90 \
+    --hash=sha256:8e275ee4cb11c262bd108ab2081f750db2a1c0b8c12c1897f27b160c8bd57bbc \
+    --hash=sha256:9044b5e4f7083f209c4e35aa5dd54b1dd5b112b108648f5c902ad586d4f945c5 \
+    --hash=sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1 \
+    --hash=sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3 \
+    --hash=sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35 \
+    --hash=sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f \
+    --hash=sha256:9ee85f0696a17dd28fbcfceb59f9510aa71934b483d1f5601d1030c3c8304f3c \
+    --hash=sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2 \
+    --hash=sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2 \
+    --hash=sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf \
+    --hash=sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65 \
+    --hash=sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b \
+    --hash=sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442 \
+    --hash=sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2 \
+    --hash=sha256:ab8a209b8485d3db694fa97a896d96dd6533d63c22829043fd9de627060beade \
+    --hash=sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482 \
+    --hash=sha256:ad5db5781c774ab9a9b2c4302bbf0c1014960a0a7be63278d13ae6fdf88126fe \
+    --hash=sha256:ae98e14432d458fc3de11a77ccb3ae65ddce70f730e7c76140653048c71bfcbc \
+    --hash=sha256:b20be51b37a75cc54c2c55def3fa2c65bb94ba859dde241cd0a4fd302de5ae0a \
+    --hash=sha256:b523466b1a31d0dcef7c5be1f20b942919b62fd6e9a9be199d035509cbefc0ec \
+    --hash=sha256:b5d658fbd9f0d6eea113aea286b21d3cd4d3fd978157cbf2447a6035916506d3 \
+    --hash=sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a \
+    --hash=sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07 \
+    --hash=sha256:bf902d7413c82a1bfa08b06a070876132a5ae6b2388e2712aab3a7cbc02205c6 \
+    --hash=sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f \
+    --hash=sha256:c1eec9d950b6fe688edee07138993e54ee4ae634c51443cfb7c1e7613322718e \
+    --hash=sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192 \
+    --hash=sha256:cc1331b6d5a6e144aeb5e626f4375f5b7ae9934ba620c0ac6b3e43d5e683a0f0 \
+    --hash=sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6 \
+    --hash=sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73 \
+    --hash=sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f \
+    --hash=sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6 \
+    --hash=sha256:dd0e081319328928531df7a0e63621caf67652c8464303fd102141b785ef9547 \
+    --hash=sha256:dda60aa465b861324e65a78c9f5cf0f4bc713e4309f83bc387be158b077963d9 \
+    --hash=sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457 \
+    --hash=sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8 \
+    --hash=sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26 \
+    --hash=sha256:e5449ca63da169a2e6068dd0e2fcc8d91f9558aba89ff6d02121ca8ab11e79e5 \
+    --hash=sha256:e63e4e5081de46517099dc30abe418122f54531a6ae2ebc8680bcd7096860eab \
+    --hash=sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070 \
+    --hash=sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71 \
+    --hash=sha256:f86d3a7a9af5d826744fabf4afd15b9dfef44fe69a98541f666f66fbb8d3fef9 \
+    --hash=sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761
+    # via
+    #   matplotlib
+    #   torchvision
+plotnine==0.14.5 \
+    --hash=sha256:4a8bc4360732dd69a0263def4abab285ed8f0f4386186f1e44c642f2cea79b88 \
+    --hash=sha256:9e75969e8e10d8d770a4be36d10e075cc10b88ca6fcc99e36ada53436fb5653f
+    # via -r requirements-dev.txt
+promise==2.3 \
+    --hash=sha256:dfd18337c523ba4b6a58801c164c1904a9d4d1b1747c7d5dbf45b693a49d93d0
+    # via tensorflow-datasets
+propcache==0.2.1 \
+    --hash=sha256:03ff9d3f665769b2a85e6157ac8b439644f2d7fd17615a82fa55739bc97863f4 \
+    --hash=sha256:049324ee97bb67285b49632132db351b41e77833678432be52bdd0289c0e05e4 \
+    --hash=sha256:081a430aa8d5e8876c6909b67bd2d937bfd531b0382d3fdedb82612c618bc41a \
+    --hash=sha256:0f022d381747f0dfe27e99d928e31bc51a18b65bb9e481ae0af1380a6725dd1f \
+    --hash=sha256:12d1083f001ace206fe34b6bdc2cb94be66d57a850866f0b908972f90996b3e9 \
+    --hash=sha256:14d86fe14b7e04fa306e0c43cdbeebe6b2c2156a0c9ce56b815faacc193e320d \
+    --hash=sha256:160291c60081f23ee43d44b08a7e5fb76681221a8e10b3139618c5a9a291b84e \
+    --hash=sha256:1672137af7c46662a1c2be1e8dc78cb6d224319aaa40271c9257d886be4363a6 \
+    --hash=sha256:19a0f89a7bb9d8048d9c4370c9c543c396e894c76be5525f5e1ad287f1750ddf \
+    --hash=sha256:1ac2f5fe02fa75f56e1ad473f1175e11f475606ec9bd0be2e78e4734ad575034 \
+    --hash=sha256:1cd9a1d071158de1cc1c71a26014dcdfa7dd3d5f4f88c298c7f90ad6f27bb46d \
+    --hash=sha256:1ffc3cca89bb438fb9c95c13fc874012f7b9466b89328c3c8b1aa93cdcfadd16 \
+    --hash=sha256:297878dc9d0a334358f9b608b56d02e72899f3b8499fc6044133f0d319e2ec30 \
+    --hash=sha256:2d3af2e79991102678f53e0dbf4c35de99b6b8b58f29a27ca0325816364caaba \
+    --hash=sha256:30b43e74f1359353341a7adb783c8f1b1c676367b011709f466f42fda2045e95 \
+    --hash=sha256:3156628250f46a0895f1f36e1d4fbe062a1af8718ec3ebeb746f1d23f0c5dc4d \
+    --hash=sha256:31f5af773530fd3c658b32b6bdc2d0838543de70eb9a2156c03e410f7b0d3aae \
+    --hash=sha256:3935bfa5fede35fb202c4b569bb9c042f337ca4ff7bd540a0aa5e37131659348 \
+    --hash=sha256:39d51fbe4285d5db5d92a929e3e21536ea3dd43732c5b177c7ef03f918dff9f2 \
+    --hash=sha256:3f77ce728b19cb537714499928fe800c3dda29e8d9428778fc7c186da4c09a64 \
+    --hash=sha256:4160d9283bd382fa6c0c2b5e017acc95bc183570cd70968b9202ad6d8fc48dce \
+    --hash=sha256:4a571d97dbe66ef38e472703067021b1467025ec85707d57e78711c085984e54 \
+    --hash=sha256:4e6281aedfca15301c41f74d7005e6e3f4ca143584ba696ac69df4f02f40d629 \
+    --hash=sha256:52277518d6aae65536e9cea52d4e7fd2f7a66f4aa2d30ed3f2fcea620ace3c54 \
+    --hash=sha256:556fc6c10989f19a179e4321e5d678db8eb2924131e64652a51fe83e4c3db0e1 \
+    --hash=sha256:574faa3b79e8ebac7cb1d7930f51184ba1ccf69adfdec53a12f319a06030a68b \
+    --hash=sha256:58791550b27d5488b1bb52bc96328456095d96206a250d28d874fafe11b3dfaf \
+    --hash=sha256:5b750a8e5a1262434fb1517ddf64b5de58327f1adc3524a5e44c2ca43305eb0b \
+    --hash=sha256:5d97151bc92d2b2578ff7ce779cdb9174337390a535953cbb9452fb65164c587 \
+    --hash=sha256:5eee736daafa7af6d0a2dc15cc75e05c64f37fc37bafef2e00d77c14171c2097 \
+    --hash=sha256:6445804cf4ec763dc70de65a3b0d9954e868609e83850a47ca4f0cb64bd79fea \
+    --hash=sha256:647894f5ae99c4cf6bb82a1bb3a796f6e06af3caa3d32e26d2350d0e3e3faf24 \
+    --hash=sha256:66d4cfda1d8ed687daa4bc0274fcfd5267873db9a5bc0418c2da19273040eeb7 \
+    --hash=sha256:6a9a8c34fb7bb609419a211e59da8887eeca40d300b5ea8e56af98f6fbbb1541 \
+    --hash=sha256:6b3f39a85d671436ee3d12c017f8fdea38509e4f25b28eb25877293c98c243f6 \
+    --hash=sha256:6b6fb63ae352e13748289f04f37868099e69dba4c2b3e271c46061e82c745634 \
+    --hash=sha256:70693319e0b8fd35dd863e3e29513875eb15c51945bf32519ef52927ca883bc3 \
+    --hash=sha256:781e65134efaf88feb447e8c97a51772aa75e48b794352f94cb7ea717dedda0d \
+    --hash=sha256:819ce3b883b7576ca28da3861c7e1a88afd08cc8c96908e08a3f4dd64a228034 \
+    --hash=sha256:857112b22acd417c40fa4595db2fe28ab900c8c5fe4670c7989b1c0230955465 \
+    --hash=sha256:887d9b0a65404929641a9fabb6452b07fe4572b269d901d622d8a34a4e9043b2 \
+    --hash=sha256:8b3489ff1ed1e8315674d0775dc7d2195fb13ca17b3808721b54dbe9fd020faf \
+    --hash=sha256:92fc4500fcb33899b05ba73276dfb684a20d31caa567b7cb5252d48f896a91b1 \
+    --hash=sha256:9403db39be1393618dd80c746cb22ccda168efce239c73af13c3763ef56ffc04 \
+    --hash=sha256:98110aa363f1bb4c073e8dcfaefd3a5cea0f0834c2aab23dda657e4dab2f53b5 \
+    --hash=sha256:999779addc413181912e984b942fbcc951be1f5b3663cd80b2687758f434c583 \
+    --hash=sha256:9caac6b54914bdf41bcc91e7eb9147d331d29235a7c967c150ef5df6464fd1bb \
+    --hash=sha256:a7a078f5d37bee6690959c813977da5291b24286e7b962e62a94cec31aa5188b \
+    --hash=sha256:a7e65eb5c003a303b94aa2c3852ef130230ec79e349632d030e9571b87c4698c \
+    --hash=sha256:a96dc1fa45bd8c407a0af03b2d5218392729e1822b0c32e62c5bf7eeb5fb3958 \
+    --hash=sha256:aca405706e0b0a44cc6bfd41fbe89919a6a56999157f6de7e182a990c36e37bc \
+    --hash=sha256:accb6150ce61c9c4b7738d45550806aa2b71c7668c6942f17b0ac182b6142fd4 \
+    --hash=sha256:ad1af54a62ffe39cf34db1aa6ed1a1873bd548f6401db39d8e7cd060b9211f82 \
+    --hash=sha256:ae1aa1cd222c6d205853b3013c69cd04515f9d6ab6de4b0603e2e1c33221303e \
+    --hash=sha256:b2d0a12018b04f4cb820781ec0dffb5f7c7c1d2a5cd22bff7fb055a2cb19ebce \
+    --hash=sha256:b480c6a4e1138e1aa137c0079b9b6305ec6dcc1098a8ca5196283e8a49df95a9 \
+    --hash=sha256:b74c261802d3d2b85c9df2dfb2fa81b6f90deeef63c2db9f0e029a3cac50b518 \
+    --hash=sha256:ba278acf14471d36316159c94a802933d10b6a1e117b8554fe0d0d9b75c9d536 \
+    --hash=sha256:bb6178c241278d5fe853b3de743087be7f5f4c6f7d6d22a3b524d323eecec505 \
+    --hash=sha256:bf72af5e0fb40e9babf594308911436c8efde3cb5e75b6f206c34ad18be5c052 \
+    --hash=sha256:bfd3223c15bebe26518d58ccf9a39b93948d3dcb3e57a20480dfdd315356baff \
+    --hash=sha256:c214999039d4f2a5b2073ac506bba279945233da8c786e490d411dfc30f855c1 \
+    --hash=sha256:c2f992c07c0fca81655066705beae35fc95a2fa7366467366db627d9f2ee097f \
+    --hash=sha256:cba4cfa1052819d16699e1d55d18c92b6e094d4517c41dd231a8b9f87b6fa681 \
+    --hash=sha256:cea7daf9fc7ae6687cf1e2c049752f19f146fdc37c2cc376e7d0032cf4f25347 \
+    --hash=sha256:cf6c4150f8c0e32d241436526f3c3f9cbd34429492abddbada2ffcff506c51af \
+    --hash=sha256:d09c333d36c1409d56a9d29b3a1b800a42c76a57a5a8907eacdbce3f18768246 \
+    --hash=sha256:d27b84d5880f6d8aa9ae3edb253c59d9f6642ffbb2c889b78b60361eed449787 \
+    --hash=sha256:d2ccec9ac47cf4e04897619c0e0c1a48c54a71bdf045117d3a26f80d38ab1fb0 \
+    --hash=sha256:d71264a80f3fcf512eb4f18f59423fe82d6e346ee97b90625f283df56aee103f \
+    --hash=sha256:d93f3307ad32a27bda2e88ec81134b823c240aa3abb55821a8da553eed8d9439 \
+    --hash=sha256:d9631c5e8b5b3a0fda99cb0d29c18133bca1e18aea9effe55adb3da1adef80d3 \
+    --hash=sha256:ddfab44e4489bd79bda09d84c430677fc7f0a4939a73d2bba3073036f487a0a6 \
+    --hash=sha256:e7048abd75fe40712005bcfc06bb44b9dfcd8e101dda2ecf2f5aa46115ad07ca \
+    --hash=sha256:e73091191e4280403bde6c9a52a6999d69cdfde498f1fdf629105247599b57ec \
+    --hash=sha256:e800776a79a5aabdb17dcc2346a7d66d0777e942e4cd251defeb084762ecd17d \
+    --hash=sha256:edc9fc7051e3350643ad929df55c451899bb9ae6d24998a949d2e4c87fb596d3 \
+    --hash=sha256:f089118d584e859c62b3da0892b88a83d611c2033ac410e929cb6754eec0ed16 \
+    --hash=sha256:f174bbd484294ed9fdf09437f889f95807e5f229d5d93588d34e92106fbf6717 \
+    --hash=sha256:f508b0491767bb1f2b87fdfacaba5f7eddc2f867740ec69ece6d1946d29029a6 \
+    --hash=sha256:f7a31fc1e1bd362874863fdeed71aed92d348f5336fd84f2197ba40c59f061bd \
+    --hash=sha256:f9479aa06a793c5aeba49ce5c5692ffb51fcd9a7016e017d555d5e2b0045d212
+    # via
+    #   aiohttp
+    #   yarl
+protobuf==5.27.3 \
+    --hash=sha256:043853dcb55cc262bf2e116215ad43fa0859caab79bb0b2d31b708f128ece035 \
+    --hash=sha256:16ddf3f8c6c41e1e803da7abea17b1793a97ef079a912e42351eabb19b2cffe7 \
+    --hash=sha256:68248c60d53f6168f565a8c76dc58ba4fa2ade31c2d1ebdae6d80f969cdc2d4f \
+    --hash=sha256:82460903e640f2b7e34ee81a947fdaad89de796d324bcbc38ff5430bcdead82c \
+    --hash=sha256:8572c6533e544ebf6899c360e91d6bcbbee2549251643d32c52cf8a5de295ba5 \
+    --hash=sha256:a55c48f2a2092d8e213bd143474df33a6ae751b781dd1d1f4d953c128a415b25 \
+    --hash=sha256:af7c0b7cfbbb649ad26132e53faa348580f844d9ca46fd3ec7ca48a1ea5db8a1 \
+    --hash=sha256:b8a994fb3d1c11156e7d1e427186662b64694a62b55936b2b9348f0a7c6625ce \
+    --hash=sha256:c2a105c24f08b1e53d6c7ffe69cb09d0031512f0b72f812dd4005b8112dbe91e \
+    --hash=sha256:c84eee2c71ed83704f1afbf1a85c3171eab0fd1ade3b399b3fad0884cbcca8bf \
+    --hash=sha256:dcb307cd4ef8fec0cf52cb9105a03d06fbb5275ce6d84a6ae33bc6cf84e0a07b
+    # via
+    #   -r requirements-dev.txt
+    #   google-api-core
+    #   googleapis-common-protos
+    #   orbax-checkpoint
+    #   tensorboard
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+    #   tensorflow-metadata
+psutil==6.1.1 \
+    --hash=sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca \
+    --hash=sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377 \
+    --hash=sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468 \
+    --hash=sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3 \
+    --hash=sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603 \
+    --hash=sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac \
+    --hash=sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303 \
+    --hash=sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4 \
+    --hash=sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160 \
+    --hash=sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8 \
+    --hash=sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003 \
+    --hash=sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030 \
+    --hash=sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777 \
+    --hash=sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5 \
+    --hash=sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53 \
+    --hash=sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649 \
+    --hash=sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8
+    # via tensorflow-datasets
+pyarrow==18.1.0 \
+    --hash=sha256:01c034b576ce0eef554f7c3d8c341714954be9b3f5d5bc7117006b85fcf302fe \
+    --hash=sha256:05a5636ec3eb5cc2a36c6edb534a38ef57b2ab127292a716d00eabb887835f1e \
+    --hash=sha256:0743e503c55be0fdb5c08e7d44853da27f19dc854531c0570f9f394ec9671d54 \
+    --hash=sha256:0ad4892617e1a6c7a551cfc827e072a633eaff758fa09f21c4ee548c30bcaf99 \
+    --hash=sha256:0b331e477e40f07238adc7ba7469c36b908f07c89b95dd4bd3a0ec84a3d1e21e \
+    --hash=sha256:11b676cd410cf162d3f6a70b43fb9e1e40affbc542a1e9ed3681895f2962d3d9 \
+    --hash=sha256:25dbacab8c5952df0ca6ca0af28f50d45bd31c1ff6fcf79e2d120b4a65ee7181 \
+    --hash=sha256:2c4dd0c9010a25ba03e198fe743b1cc03cd33c08190afff371749c52ccbbaf76 \
+    --hash=sha256:36ac22d7782554754a3b50201b607d553a8d71b78cdf03b33c1125be4b52397c \
+    --hash=sha256:3b2e2239339c538f3464308fd345113f886ad031ef8266c6f004d49769bb074c \
+    --hash=sha256:3c35813c11a059056a22a3bef520461310f2f7eea5c8a11ef9de7062a23f8d56 \
+    --hash=sha256:4a4813cb8ecf1809871fd2d64a8eff740a1bd3691bbe55f01a3cf6c5ec869754 \
+    --hash=sha256:4f443122c8e31f4c9199cb23dca29ab9427cef990f283f80fe15b8e124bcc49b \
+    --hash=sha256:4f97b31b4c4e21ff58c6f330235ff893cc81e23da081b1a4b1c982075e0ed4e9 \
+    --hash=sha256:543ad8459bc438efc46d29a759e1079436290bd583141384c6f7a1068ed6f992 \
+    --hash=sha256:6a276190309aba7bc9d5bd2933230458b3521a4317acfefe69a354f2fe59f2bc \
+    --hash=sha256:73eeed32e724ea3568bb06161cad5fa7751e45bc2228e33dcb10c614044165c7 \
+    --hash=sha256:74de649d1d2ccb778f7c3afff6085bd5092aed4c23df9feeb45dd6b16f3811aa \
+    --hash=sha256:84e314d22231357d473eabec709d0ba285fa706a72377f9cc8e1cb3c8013813b \
+    --hash=sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73 \
+    --hash=sha256:9736ba3c85129d72aefa21b4f3bd715bc4190fe4426715abfff90481e7d00812 \
+    --hash=sha256:9f3a76670b263dc41d0ae877f09124ab96ce10e4e48f3e3e4257273cee61ad0d \
+    --hash=sha256:a1880dd6772b685e803011a6b43a230c23b566859a6e0c9a276c1e0faf4f4052 \
+    --hash=sha256:acb7564204d3c40babf93a05624fc6a8ec1ab1def295c363afc40b0c9e66c191 \
+    --hash=sha256:ad514dbfcffe30124ce655d72771ae070f30bf850b48bc4d9d3b25993ee0e386 \
+    --hash=sha256:aebc13a11ed3032d8dd6e7171eb6e86d40d67a5639d96c35142bd568b9299324 \
+    --hash=sha256:b516dad76f258a702f7ca0250885fc93d1fa5ac13ad51258e39d402bd9e2e1e4 \
+    --hash=sha256:b76130d835261b38f14fc41fdfb39ad8d672afb84c447126b84d5472244cfaba \
+    --hash=sha256:ba17845efe3aa358ec266cf9cc2800fa73038211fb27968bfa88acd09261a470 \
+    --hash=sha256:c0a03da7f2758645d17b7b4f83c8bffeae5bbb7f974523fe901f36288d2eab71 \
+    --hash=sha256:c52f81aa6f6575058d8e2c782bf79d4f9fdc89887f16825ec3a66607a5dd8e30 \
+    --hash=sha256:d4b3d2a34780645bed6414e22dda55a92e0fcd1b8a637fba86800ad737057e33 \
+    --hash=sha256:d4f13eee18433f99adefaeb7e01d83b59f73360c231d4782d9ddfaf1c3fbde0a \
+    --hash=sha256:d6cf5c05f3cee251d80e98726b5c7cc9f21bab9e9783673bac58e6dfab57ecc8 \
+    --hash=sha256:da31fbca07c435be88a0c321402c4e31a2ba61593ec7473630769de8346b54ee \
+    --hash=sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c \
+    --hash=sha256:e31e9417ba9c42627574bdbfeada7217ad8a4cbbe45b9d6bdd4b62abbca4c6f6 \
+    --hash=sha256:eaeabf638408de2772ce3d7793b2668d4bb93807deed1725413b70e3156a7854 \
+    --hash=sha256:f266a2c0fc31995a06ebd30bcfdb7f615d7278035ec5b1cd71c48d56daaf30b0 \
+    --hash=sha256:f39a2e0ed32a0970e4e46c262753417a60c43a3246972cfc2d3eb85aedd01b21 \
+    --hash=sha256:f591704ac05dfd0477bb8f8e0bd4b5dc52c1cadf50503858dce3a15db6e46ff2 \
+    --hash=sha256:f96bd502cb11abb08efea6dab09c003305161cb6c9eafd432e35e76e7fa9b90c
+    # via
+    #   datasets
+    #   tensorflow-datasets
+pyasn1==0.6.1 \
+    --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \
+    --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034
+    # via
+    #   oauth2client
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.1 \
+    --hash=sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd \
+    --hash=sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c
+    # via
+    #   google-auth
+    #   oauth2client
+pygments==2.19.0 \
+    --hash=sha256:4755e6e64d22161d5b61432c0600c923c5927214e7c956e31c23923c89251a9b \
+    --hash=sha256:afc4146269910d4bdfabcd27c24923137a74d562a23a320a41a55ad303e19783
+    # via rich
+pyparsing==3.2.1 \
+    --hash=sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1 \
+    --hash=sha256:61980854fd66de3a90028d679a954d5f2623e83144b5afe5ee86f43d762e5f0a
+    # via
+    #   httplib2
+    #   matplotlib
+python-dateutil==2.9.0.post0 \
+    --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \
+    --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427
+    # via
+    #   matplotlib
+    #   pandas
+pytz==2024.2 \
+    --hash=sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a \
+    --hash=sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725
+    # via
+    #   google-api-core
+    #   pandas
+pyyaml==6.0.2 \
+    --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \
+    --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \
+    --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \
+    --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \
+    --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \
+    --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \
+    --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \
+    --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \
+    --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \
+    --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \
+    --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \
+    --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \
+    --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \
+    --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \
+    --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \
+    --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \
+    --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \
+    --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \
+    --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \
+    --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \
+    --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \
+    --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \
+    --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \
+    --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \
+    --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \
+    --hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \
+    --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \
+    --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \
+    --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \
+    --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \
+    --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \
+    --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \
+    --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \
+    --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \
+    --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \
+    --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \
+    --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \
+    --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \
+    --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \
+    --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \
+    --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \
+    --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \
+    --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \
+    --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \
+    --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \
+    --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \
+    --hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \
+    --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \
+    --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \
+    --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \
+    --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \
+    --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \
+    --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4
+    # via
+    #   flax
+    #   huggingface-hub
+    #   orbax-checkpoint
+    #   torch-xla
+    #   transformers
+regex==2024.11.6 \
+    --hash=sha256:02a02d2bb04fec86ad61f3ea7f49c015a0681bf76abb9857f945d26159d2968c \
+    --hash=sha256:02e28184be537f0e75c1f9b2f8847dc51e08e6e171c6bde130b2687e0c33cf60 \
+    --hash=sha256:040df6fe1a5504eb0f04f048e6d09cd7c7110fef851d7c567a6b6e09942feb7d \
+    --hash=sha256:068376da5a7e4da51968ce4c122a7cd31afaaec4fccc7856c92f63876e57b51d \
+    --hash=sha256:06eb1be98df10e81ebaded73fcd51989dcf534e3c753466e4b60c4697a003b67 \
+    --hash=sha256:072623554418a9911446278f16ecb398fb3b540147a7828c06e2011fa531e773 \
+    --hash=sha256:086a27a0b4ca227941700e0b31425e7a28ef1ae8e5e05a33826e17e47fbfdba0 \
+    --hash=sha256:08986dce1339bc932923e7d1232ce9881499a0e02925f7402fb7c982515419ef \
+    --hash=sha256:0a86e7eeca091c09e021db8eb72d54751e527fa47b8d5787caf96d9831bd02ad \
+    --hash=sha256:0c32f75920cf99fe6b6c539c399a4a128452eaf1af27f39bce8909c9a3fd8cbe \
+    --hash=sha256:0d7f453dca13f40a02b79636a339c5b62b670141e63efd511d3f8f73fba162b3 \
+    --hash=sha256:1062b39a0a2b75a9c694f7a08e7183a80c63c0d62b301418ffd9c35f55aaa114 \
+    --hash=sha256:13291b39131e2d002a7940fb176e120bec5145f3aeb7621be6534e46251912c4 \
+    --hash=sha256:149f5008d286636e48cd0b1dd65018548944e495b0265b45e1bffecce1ef7f39 \
+    --hash=sha256:164d8b7b3b4bcb2068b97428060b2a53be050085ef94eca7f240e7947f1b080e \
+    --hash=sha256:167ed4852351d8a750da48712c3930b031f6efdaa0f22fa1933716bfcd6bf4a3 \
+    --hash=sha256:1c4de13f06a0d54fa0d5ab1b7138bfa0d883220965a29616e3ea61b35d5f5fc7 \
+    --hash=sha256:202eb32e89f60fc147a41e55cb086db2a3f8cb82f9a9a88440dcfc5d37faae8d \
+    --hash=sha256:220902c3c5cc6af55d4fe19ead504de80eb91f786dc102fbd74894b1551f095e \
+    --hash=sha256:2b3361af3198667e99927da8b84c1b010752fa4b1115ee30beaa332cabc3ef1a \
+    --hash=sha256:2c89a8cc122b25ce6945f0423dc1352cb9593c68abd19223eebbd4e56612c5b7 \
+    --hash=sha256:2d548dafee61f06ebdb584080621f3e0c23fff312f0de1afc776e2a2ba99a74f \
+    --hash=sha256:2e34b51b650b23ed3354b5a07aab37034d9f923db2a40519139af34f485f77d0 \
+    --hash=sha256:32f9a4c643baad4efa81d549c2aadefaeba12249b2adc5af541759237eee1c54 \
+    --hash=sha256:3a51ccc315653ba012774efca4f23d1d2a8a8f278a6072e29c7147eee7da446b \
+    --hash=sha256:3cde6e9f2580eb1665965ce9bf17ff4952f34f5b126beb509fee8f4e994f143c \
+    --hash=sha256:40291b1b89ca6ad8d3f2b82782cc33807f1406cf68c8d440861da6304d8ffbbd \
+    --hash=sha256:41758407fc32d5c3c5de163888068cfee69cb4c2be844e7ac517a52770f9af57 \
+    --hash=sha256:4181b814e56078e9b00427ca358ec44333765f5ca1b45597ec7446d3a1ef6e34 \
+    --hash=sha256:4f51f88c126370dcec4908576c5a627220da6c09d0bff31cfa89f2523843316d \
+    --hash=sha256:50153825ee016b91549962f970d6a4442fa106832e14c918acd1c8e479916c4f \
+    --hash=sha256:5056b185ca113c88e18223183aa1a50e66507769c9640a6ff75859619d73957b \
+    --hash=sha256:5071b2093e793357c9d8b2929dfc13ac5f0a6c650559503bb81189d0a3814519 \
+    --hash=sha256:525eab0b789891ac3be914d36893bdf972d483fe66551f79d3e27146191a37d4 \
+    --hash=sha256:52fb28f528778f184f870b7cf8f225f5eef0a8f6e3778529bdd40c7b3920796a \
+    --hash=sha256:5478c6962ad548b54a591778e93cd7c456a7a29f8eca9c49e4f9a806dcc5d638 \
+    --hash=sha256:5670bce7b200273eee1840ef307bfa07cda90b38ae56e9a6ebcc9f50da9c469b \
+    --hash=sha256:5704e174f8ccab2026bd2f1ab6c510345ae8eac818b613d7d73e785f1310f839 \
+    --hash=sha256:59dfe1ed21aea057a65c6b586afd2a945de04fc7db3de0a6e3ed5397ad491b07 \
+    --hash=sha256:5e7e351589da0850c125f1600a4c4ba3c722efefe16b297de54300f08d734fbf \
+    --hash=sha256:63b13cfd72e9601125027202cad74995ab26921d8cd935c25f09c630436348ff \
+    --hash=sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0 \
+    --hash=sha256:684d7a212682996d21ca12ef3c17353c021fe9de6049e19ac8481ec35574a70f \
+    --hash=sha256:69ab78f848845569401469da20df3e081e6b5a11cb086de3eed1d48f5ed57c95 \
+    --hash=sha256:6f44ec28b1f858c98d3036ad5d7d0bfc568bdd7a74f9c24e25f41ef1ebfd81a4 \
+    --hash=sha256:70b7fa6606c2881c1db9479b0eaa11ed5dfa11c8d60a474ff0e095099f39d98e \
+    --hash=sha256:764e71f22ab3b305e7f4c21f1a97e1526a25ebdd22513e251cf376760213da13 \
+    --hash=sha256:7ab159b063c52a0333c884e4679f8d7a85112ee3078fe3d9004b2dd875585519 \
+    --hash=sha256:805e6b60c54bf766b251e94526ebad60b7de0c70f70a4e6210ee2891acb70bf2 \
+    --hash=sha256:8447d2d39b5abe381419319f942de20b7ecd60ce86f16a23b0698f22e1b70008 \
+    --hash=sha256:86fddba590aad9208e2fa8b43b4c098bb0ec74f15718bb6a704e3c63e2cef3e9 \
+    --hash=sha256:89d75e7293d2b3e674db7d4d9b1bee7f8f3d1609428e293771d1a962617150cc \
+    --hash=sha256:93c0b12d3d3bc25af4ebbf38f9ee780a487e8bf6954c115b9f015822d3bb8e48 \
+    --hash=sha256:94d87b689cdd831934fa3ce16cc15cd65748e6d689f5d2b8f4f4df2065c9fa20 \
+    --hash=sha256:9714398225f299aa85267fd222f7142fcb5c769e73d7733344efc46f2ef5cf89 \
+    --hash=sha256:982e6d21414e78e1f51cf595d7f321dcd14de1f2881c5dc6a6e23bbbbd68435e \
+    --hash=sha256:997d6a487ff00807ba810e0f8332c18b4eb8d29463cfb7c820dc4b6e7562d0cf \
+    --hash=sha256:a03e02f48cd1abbd9f3b7e3586d97c8f7a9721c436f51a5245b3b9483044480b \
+    --hash=sha256:a36fdf2af13c2b14738f6e973aba563623cb77d753bbbd8d414d18bfaa3105dd \
+    --hash=sha256:a6ba92c0bcdf96cbf43a12c717eae4bc98325ca3730f6b130ffa2e3c3c723d84 \
+    --hash=sha256:a7c2155f790e2fb448faed6dd241386719802296ec588a8b9051c1f5c481bc29 \
+    --hash=sha256:a93c194e2df18f7d264092dc8539b8ffb86b45b899ab976aa15d48214138e81b \
+    --hash=sha256:abfa5080c374a76a251ba60683242bc17eeb2c9818d0d30117b4486be10c59d3 \
+    --hash=sha256:ac10f2c4184420d881a3475fb2c6f4d95d53a8d50209a2500723d831036f7c45 \
+    --hash=sha256:ad182d02e40de7459b73155deb8996bbd8e96852267879396fb274e8700190e3 \
+    --hash=sha256:b2837718570f95dd41675328e111345f9b7095d821bac435aac173ac80b19983 \
+    --hash=sha256:b489578720afb782f6ccf2840920f3a32e31ba28a4b162e13900c3e6bd3f930e \
+    --hash=sha256:b583904576650166b3d920d2bcce13971f6f9e9a396c673187f49811b2769dc7 \
+    --hash=sha256:b85c2530be953a890eaffde05485238f07029600e8f098cdf1848d414a8b45e4 \
+    --hash=sha256:b97c1e0bd37c5cd7902e65f410779d39eeda155800b65fc4d04cc432efa9bc6e \
+    --hash=sha256:ba9b72e5643641b7d41fa1f6d5abda2c9a263ae835b917348fc3c928182ad467 \
+    --hash=sha256:bb26437975da7dc36b7efad18aa9dd4ea569d2357ae6b783bf1118dabd9ea577 \
+    --hash=sha256:bb8f74f2f10dbf13a0be8de623ba4f9491faf58c24064f32b65679b021ed0001 \
+    --hash=sha256:bde01f35767c4a7899b7eb6e823b125a64de314a8ee9791367c9a34d56af18d0 \
+    --hash=sha256:bec9931dfb61ddd8ef2ebc05646293812cb6b16b60cf7c9511a832b6f1854b55 \
+    --hash=sha256:c36f9b6f5f8649bb251a5f3f66564438977b7ef8386a52460ae77e6070d309d9 \
+    --hash=sha256:cdf58d0e516ee426a48f7b2c03a332a4114420716d55769ff7108c37a09951bf \
+    --hash=sha256:d1cee317bfc014c2419a76bcc87f071405e3966da434e03e13beb45f8aced1a6 \
+    --hash=sha256:d22326fcdef5e08c154280b71163ced384b428343ae16a5ab2b3354aed12436e \
+    --hash=sha256:d3660c82f209655a06b587d55e723f0b813d3a7db2e32e5e7dc64ac2a9e86fde \
+    --hash=sha256:da8f5fc57d1933de22a9e23eec290a0d8a5927a5370d24bda9a6abe50683fe62 \
+    --hash=sha256:df951c5f4a1b1910f1a99ff42c473ff60f8225baa1cdd3539fe2819d9543e9df \
+    --hash=sha256:e5364a4502efca094731680e80009632ad6624084aff9a23ce8c8c6820de3e51 \
+    --hash=sha256:ea1bfda2f7162605f6e8178223576856b3d791109f15ea99a9f95c16a7636fb5 \
+    --hash=sha256:f02f93b92358ee3f78660e43b4b0091229260c5d5c408d17d60bf26b6c900e86 \
+    --hash=sha256:f056bf21105c2515c32372bbc057f43eb02aae2fda61052e2f7622c801f0b4e2 \
+    --hash=sha256:f1ac758ef6aebfc8943560194e9fd0fa18bcb34d89fd8bd2af18183afd8da3a2 \
+    --hash=sha256:f2a19f302cd1ce5dd01a9099aaa19cae6173306d1302a43b627f62e21cf18ac0 \
+    --hash=sha256:f654882311409afb1d780b940234208a252322c24a93b442ca714d119e68086c \
+    --hash=sha256:f65557897fc977a44ab205ea871b690adaef6b9da6afda4790a2484b04293a5f \
+    --hash=sha256:f9d1e379028e0fc2ae3654bac3cbbef81bf3fd571272a42d56c24007979bafb6 \
+    --hash=sha256:fdabbfc59f2c6edba2a6622c647b716e34e8e3867e0ab975412c5c2f79b82da2 \
+    --hash=sha256:fdd6028445d2460f33136c55eeb1f601ab06d74cb3347132e1c24250187500d9 \
+    --hash=sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91
+    # via transformers
+requests==2.32.3 \
+    --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \
+    --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6
+    # via
+    #   datasets
+    #   google-api-core
+    #   huggingface-hub
+    #   responses
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+    #   transformers
+responses==0.18.0 \
+    --hash=sha256:15c63ad16de13ee8e7182d99c9334f64fd81f1ee79f90748d527c28f7ca9dd51 \
+    --hash=sha256:380cad4c1c1dc942e5e8a8eaae0b4d4edf708f4f010db8b7bcfafad1fcd254ff
+    # via datasets
+rich==13.9.4 \
+    --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \
+    --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90
+    # via
+    #   flax
+    #   keras
+rsa==4.9 \
+    --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \
+    --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21
+    # via
+    #   google-auth
+    #   oauth2client
+safetensors==0.5.0 \
+    --hash=sha256:0371afd84c200a80eb7103bf715108b0c3846132fb82453ae018609a15551580 \
+    --hash=sha256:20067e7a5e63f0cbc88457b2a1161e70ff73af4cc3a24bce90309430cd6f6e7e \
+    --hash=sha256:53715e4ea0ef23c08f004baae0f609a7773de7d4148727760417c6760cfd6b76 \
+    --hash=sha256:56d936028ac799e18644b08a91fd98b4b62ae3dcd0440b1cfcb56535785589f1 \
+    --hash=sha256:5ec7fc8c3d2f32ebf1c7011bc886b362e53ee0a1ec6d828c39d531fed8b325d6 \
+    --hash=sha256:6106aa835deb7263f7014f74c05842ab828d6c11d789f2e7e98f26b1a305e72d \
+    --hash=sha256:649d6a4aa34d5174ae87289068ccc2fec2a1a998ecf83425aa5a42c3eff69bcf \
+    --hash=sha256:a1349611f74f55c5ee1c1c144c536a2743c38f7d8bf60b9fc8267e0efc0591a2 \
+    --hash=sha256:a2f26afada2233576ffea6b80042c2c0a8105c164254af56168ec14299ad3122 \
+    --hash=sha256:b85565bc2f0456961a788d2f11d9d892eec46603db0e4923aa9512c2355aa727 \
+    --hash=sha256:bdf6a3e366ea8ba1a0538db6099229e95811194432c684ea28ea7ae28763b8dc \
+    --hash=sha256:c47b34c549fa1e0c655c4644da31332c61332c732c47c8dd9399347e9aac69d1 \
+    --hash=sha256:c683b9b485bee43422ba2855f72777c37647190281e03da4c8d2a69fa5336558 \
+    --hash=sha256:debff88f41d569a3e93a955469f83864e432af35bb34b16f65a9ddf378daa3ae \
+    --hash=sha256:f451941f8aa11e7be5c3fa450e264609a2b1e65fa38ae590a74e55a94d646b76
+    # via transformers
+scikit-learn==1.5.2 \
+    --hash=sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445 \
+    --hash=sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3 \
+    --hash=sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de \
+    --hash=sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6 \
+    --hash=sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0 \
+    --hash=sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6 \
+    --hash=sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8 \
+    --hash=sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1 \
+    --hash=sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe \
+    --hash=sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1 \
+    --hash=sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1 \
+    --hash=sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8 \
+    --hash=sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6 \
+    --hash=sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9 \
+    --hash=sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540 \
+    --hash=sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908 \
+    --hash=sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d \
+    --hash=sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f \
+    --hash=sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113 \
+    --hash=sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7 \
+    --hash=sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5 \
+    --hash=sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd \
+    --hash=sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12 \
+    --hash=sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675 \
+    --hash=sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1 \
+    --hash=sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a
+    # via -r requirements-dev.txt
+scipy==1.15.0 \
+    --hash=sha256:0e5b34f8894f9904cc578008d1a9467829c1817e9f9cb45e6d6eeb61d2ab7731 \
+    --hash=sha256:0fcb16eb04d84670722ce8d93b05257df471704c913cb0ff9dc5a1c31d1e9422 \
+    --hash=sha256:129f899ed275c0515d553b8d31696924e2ca87d1972421e46c376b9eb87de3d2 \
+    --hash=sha256:161f80a98047c219c257bf5ce1777c574bde36b9d962a46b20d0d7e531f86863 \
+    --hash=sha256:1b29e4fc02e155a5fd1165f1e6a73edfdd110470736b0f48bcbe48083f0eee37 \
+    --hash=sha256:1e2448acd79c6374583581a1ded32ac71a00c2b9c62dfa87a40e1dd2520be111 \
+    --hash=sha256:2240e1fd0782e62e1aacdc7234212ee271d810f67e9cd3b8d521003a82603ef8 \
+    --hash=sha256:300742e2cc94e36a2880ebe464a1c8b4352a7b0f3e36ec3d2ac006cdbe0219ac \
+    --hash=sha256:327163ad73e54541a675240708244644294cb0a65cca420c9c79baeb9648e479 \
+    --hash=sha256:351899dd2a801edd3691622172bc8ea01064b1cada794f8641b89a7dc5418db6 \
+    --hash=sha256:35c68f7044b4e7ad73a3e68e513dda946989e523df9b062bd3cf401a1a882192 \
+    --hash=sha256:36be480e512d38db67f377add5b759fb117edd987f4791cdf58e59b26962bee4 \
+    --hash=sha256:37ce9394cdcd7c5f437583fc6ef91bd290014993900643fdfc7af9b052d1613b \
+    --hash=sha256:46e91b5b16909ff79224b56e19cbad65ca500b3afda69225820aa3afbf9ec020 \
+    --hash=sha256:4e08c6a36f46abaedf765dd2dfcd3698fa4bd7e311a9abb2d80e33d9b2d72c34 \
+    --hash=sha256:52475011be29dfcbecc3dfe3060e471ac5155d72e9233e8d5616b84e2b542054 \
+    --hash=sha256:5972e3f96f7dda4fd3bb85906a17338e65eaddfe47f750e240f22b331c08858e \
+    --hash=sha256:5abbdc6ede5c5fed7910cf406a948e2c0869231c0db091593a6b2fa78be77e5d \
+    --hash=sha256:5beb0a2200372b7416ec73fdae94fe81a6e85e44eb49c35a11ac356d2b8eccc6 \
+    --hash=sha256:61513b989ee8d5218fbeb178b2d51534ecaddba050db949ae99eeb3d12f6825d \
+    --hash=sha256:6d26f17c64abd6c6c2dfb39920f61518cc9e213d034b45b2380e32ba78fde4c0 \
+    --hash=sha256:6f376d7c767731477bac25a85d0118efdc94a572c6b60decb1ee48bf2391a73b \
+    --hash=sha256:767e8cf6562931f8312f4faa7ddea412cb783d8df49e62c44d00d89f41f9bbe8 \
+    --hash=sha256:82bff2eb01ccf7cea8b6ee5274c2dbeadfdac97919da308ee6d8e5bcbe846443 \
+    --hash=sha256:952d2e9eaa787f0a9e95b6e85da3654791b57a156c3e6609e65cc5176ccfe6f2 \
+    --hash=sha256:9c8254fe21dd2c6c8f7757035ec0c31daecf3bb3cffd93bc1ca661b731d28136 \
+    --hash=sha256:aeac60d3562a7bf2f35549bdfdb6b1751c50590f55ce7322b4b2fc821dc27fca \
+    --hash=sha256:b1432102254b6dc7766d081fa92df87832ac25ff0b3d3a940f37276e63eb74ff \
+    --hash=sha256:bdca4c7bb8dc41307e5f39e9e5d19c707d8e20a29845e7533b3bb20a9d4ccba0 \
+    --hash=sha256:c9624eeae79b18cab1a31944b5ef87aa14b125d6ab69b71db22f0dbd962caf1e \
+    --hash=sha256:ccb6248a9987193fe74363a2d73b93bc2c546e0728bd786050b7aef6e17db03c \
+    --hash=sha256:cd9d9198a7fd9a77f0eb5105ea9734df26f41faeb2a88a0e62e5245506f7b6df \
+    --hash=sha256:d13bbc0658c11f3d19df4138336e4bce2c4fbd78c2755be4bf7b8e235481557f \
+    --hash=sha256:d35aef233b098e4de88b1eac29f0df378278e7e250a915766786b773309137c4 \
+    --hash=sha256:de112c2dae53107cfeaf65101419662ac0a54e9a088c17958b51c95dac5de56d \
+    --hash=sha256:e9baff912ea4f78a543d183ed6f5b3bea9784509b948227daaf6f10727a0e2e5 \
+    --hash=sha256:eb1533c59f0ec6c55871206f15a5c72d1fae7ad3c0a8ca33ca88f7c309bbbf8c \
+    --hash=sha256:ec915cd26d76f6fc7ae8522f74f5b2accf39546f341c771bb2297f3871934a52 \
+    --hash=sha256:fde0f3104dfa1dfbc1f230f65506532d0558d43188789eaf68f97e106249a913 \
+    --hash=sha256:fe00169cf875bed0b3c40e4da45b57037dc21d7c7bf0c85ed75f210c281488f1
+    # via
+    #   jax
+    #   jaxlib
+    #   mizani
+    #   plotnine
+    #   scikit-learn
+    #   statsmodels
+simple-parsing==0.1.6 \
+    --hash=sha256:2a6e74b061fb754cc441559e8dcea9d108286d9e0ffaa9cca4eea6bbe85372e1 \
+    --hash=sha256:dad192e9633515a5627e343106636590a39a5ce85f6c47ced43507044ed98956
+    # via tensorflow-datasets
+simplejson==3.19.3 \
+    --hash=sha256:01c6657485393f2e9b8177c77a7634f13ebe70d5e6de150aae1677d91516ce6b \
+    --hash=sha256:0552eb06e7234da892e1d02365cd2b7b2b1f8233aa5aabdb2981587b7cc92ea0 \
+    --hash=sha256:06662392e4913dc8846d6a71a6d5de86db5fba244831abe1dd741d62a4136764 \
+    --hash=sha256:0733ecd95ae03ae718ec74aad818f5af5f3155d596f7b242acbc1621e765e5fb \
+    --hash=sha256:0766ca6222b410e08e0053a0dda3606cafb3973d5d00538307f631bb59743396 \
+    --hash=sha256:0791f64fed7d4abad639491f8a6b1ba56d3c604eb94b50f8697359b92d983f36 \
+    --hash=sha256:08f9b443a94e72dd02c87098c96886d35790e79e46b24e67accafbf13b73d43b \
+    --hash=sha256:0959e6cb62e3994b5a40e31047ff97ef5c4138875fae31659bead691bed55896 \
+    --hash=sha256:0a32859d45d7b85fb803bb68f6bee14526991a1190269116c33399fa0daf9bbf \
+    --hash=sha256:0b5ddd2c7d1d3f4d23224bc8a04bbf1430ae9a8149c05b90f8fc610f7f857a23 \
+    --hash=sha256:0bc5544e3128891bf613b9f71813ee2ec9c11574806f74dd8bb84e5e95bf64a2 \
+    --hash=sha256:101a3c8392028cd704a93c7cba8926594e775ca3c91e0bee82144e34190903f1 \
+    --hash=sha256:1069143a8fb3905e1bc0696c62be7e3adf812e9f1976ac9ae15b05112ff57cc9 \
+    --hash=sha256:1773cabfba66a6337b547e45dafbd471b09487370bcab75bd28f626520410d29 \
+    --hash=sha256:1a53a07320c5ff574d8b1a89c937ce33608832f166f39dff0581ac43dc979abd \
+    --hash=sha256:1bd41f2cb1a2c57656ceff67b12d005cb255c728265e222027ad73193a04005a \
+    --hash=sha256:1c49eeb94b8f09dc8a5843c156a22b8bde6aa1ddc65ca8ddc62dddcc001e6a2d \
+    --hash=sha256:1df0aaf1cb787fdf34484ed4a1f0c545efd8811f6028623290fef1a53694e597 \
+    --hash=sha256:1e557712fc79f251673aeb3fad3501d7d4da3a27eff0857af2e1d1afbbcf6685 \
+    --hash=sha256:1e662336db50ad665777e6548b5076329a94a0c3d4a0472971c588b3ef27de3a \
+    --hash=sha256:212fce86a22188b0c7f53533b0f693ea9605c1a0f02c84c475a30616f55a744d \
+    --hash=sha256:23228037dc5d41c36666384062904d74409a62f52283d9858fa12f4c22cffad1 \
+    --hash=sha256:23833ee7e791ec968b744dfee2a2d39df7152050051096caf4296506d75608d8 \
+    --hash=sha256:256e09d0f94d9c3d177d9e95fd27a68c875a4baa2046633df387b86b652f5747 \
+    --hash=sha256:2876027ebdd599d730d36464debe84619b0368e9a642ca6e7c601be55aed439e \
+    --hash=sha256:2a6a750d3c7461b1c47cfc6bba8d9e57a455e7c5f80057d2a82f738040dd1129 \
+    --hash=sha256:2a954b30810988feeabde843e3263bf187697e0eb5037396276db3612434049b \
+    --hash=sha256:2b737a5fefedb8333fa50b8db3dcc9b1d18fd6c598f89fa7debff8b46bf4e511 \
+    --hash=sha256:2c78293470313aefa9cfc5e3f75ca0635721fb016fb1121c1c5b0cb8cc74712a \
+    --hash=sha256:2f56eb03bc9e432bb81adc8ecff2486d39feb371abb442964ffb44f6db23b332 \
+    --hash=sha256:32a3ada8f3ea41db35e6d37b86dade03760f804628ec22e4fe775b703d567426 \
+    --hash=sha256:37105d1d708365b91165e1a6e505bdecc88637091348cf4b6adcdcb4f5a5fb8b \
+    --hash=sha256:3bbcdc438dc1683b35f7a8dc100960c721f922f9ede8127f63bed7dfded4c64c \
+    --hash=sha256:3dc5c1a85ff388e98ea877042daec3d157b6db0d85bac6ba5498034689793e7e \
+    --hash=sha256:42e5acf80d4d971238d4df97811286a044d720693092b20a56d5e56b7dcc5d09 \
+    --hash=sha256:49549e3d81ab4a58424405aa545602674d8c35c20e986b42bb8668e782a94bac \
+    --hash=sha256:49cc4c7b940d43bd12bf87ec63f28cbc4964fc4e12c031cc8cd01650f43eb94e \
+    --hash=sha256:4a0710d1a5e41c4f829caa1572793dd3130c8d65c2b194c24ff29c4c305c26e0 \
+    --hash=sha256:4dfa420bb9225dd33b6efdabde7c6a671b51150b9b1d9c4e5cd74d3b420b3fe1 \
+    --hash=sha256:50d8b742d74c449c4dcac570d08ce0f21f6a149d2d9cf7652dbf2ba9a1bc729a \
+    --hash=sha256:56134bbafe458a7b21f6fddbf889d36bec6d903718f4430768e3af822f8e27c2 \
+    --hash=sha256:5bf6a3b9a7d7191471b464fe38f684df10eb491ec9ea454003edb45a011ab187 \
+    --hash=sha256:5d9e8f836688a8fabe6a6b41b334aa550a6823f7b4ac3d3712fc0ad8655be9a8 \
+    --hash=sha256:619756f1dd634b5bdf57d9a3914300526c3b348188a765e45b8b08eabef0c94e \
+    --hash=sha256:6300680d83a399be2b8f3b0ef7ef90b35d2a29fe6e9c21438097e0938bbc1564 \
+    --hash=sha256:637c4d4b81825c1f4d651e56210bd35b5604034b192b02d2d8f17f7ce8c18f42 \
+    --hash=sha256:66a0399e21c2112acacfebf3d832ebe2884f823b1c7e6d1363f2944f1db31a99 \
+    --hash=sha256:67a20641afebf4cfbcff50061f07daad1eace6e7b31d7622b6fa2c40d43900ba \
+    --hash=sha256:6890ff9cf0bd2e1d487e2a8869ebd620a44684c0a9667fa5ee751d099d5d84c8 \
+    --hash=sha256:6d43e24b88c80f997081503f693be832fc90854f278df277dd54f8a4c847ab61 \
+    --hash=sha256:6ef9383c5e05f445be60f1735c1816163c874c0b1ede8bb4390aff2ced34f333 \
+    --hash=sha256:6f455672f4738b0f47183c5896e3606cd65c9ddee3805a4d18e8c96aa3f47c84 \
+    --hash=sha256:6fea0716c593dabb4392c4996d4e902a83b2428e6da82938cf28a523a11eb277 \
+    --hash=sha256:7017329ca8d4dca94ad5e59f496e5fc77630aecfc39df381ffc1d37fb6b25832 \
+    --hash=sha256:7137e69c6781ecf23afab064be94a277236c9cba31aa48ff1a0ec3995c69171e \
+    --hash=sha256:72e8abbc86fcac83629a030888b45fed3a404d54161118be52cb491cd6975d3e \
+    --hash=sha256:7355c7203353c36d46c4e7b6055293b3d2be097bbc5e2874a2b8a7259f0325dd \
+    --hash=sha256:76f8c28fe2d426182405b18ddf3001fce47835a557dc15c3d8bdea01c03361da \
+    --hash=sha256:7923878b7a0142d39763ec2dbecff3053c1bedd3653585a8474666e420fe83f5 \
+    --hash=sha256:7a7bfad839c624e139a4863007233a3f194e7c51551081f9789cba52e4da5167 \
+    --hash=sha256:7b5c472099b39b274dcde27f1113db8d818c9aa3ba8f78cbb8ad04a4c1ac2118 \
+    --hash=sha256:7c0104b4b7d2c75ccedbf1d9d5a3bd2daa75e51053935a44ba012e2fd4c43752 \
+    --hash=sha256:7e062767ac165df9a46963f5735aa4eee0089ec1e48b3f2ec46182754b96f55e \
+    --hash=sha256:7e2a098c21ad8924076a12b6c178965d88a0ad75d1de67e1afa0a66878f277a5 \
+    --hash=sha256:817abad79241ed4a507b3caf4d3f2be5079f39d35d4c550a061988986bffd2ec \
+    --hash=sha256:83c87706265ae3028e8460d08b05f30254c569772e859e5ba61fe8af2c883468 \
+    --hash=sha256:89b35433186e977fa86ff1fd179c1fadff39cfa3afa1648dab0b6ca53153acd9 \
+    --hash=sha256:8e086896c36210ab6050f2f9f095a5f1e03c83fa0e7f296d6cba425411364680 \
+    --hash=sha256:8f41bb5370b34f63171e65fdb00e12be1d83675cecb23e627df26f4c88dfc021 \
+    --hash=sha256:934a50a614fb831614db5dbfba35127ee277624dda4d15895c957d2f5d48610c \
+    --hash=sha256:93be280fc69a952c76e261036312c20b910e7fa9e234f1d89bdfe3fa34f8a023 \
+    --hash=sha256:951095be8d4451a7182403354c22ec2de3e513e0cc40408b689af08d02611588 \
+    --hash=sha256:a0782cb9bf827f0c488b6aa0f2819f618308a3caf2973cfd792e45d631bec4db \
+    --hash=sha256:ab69f811a660c362651ae395eba8ce84f84c944cea0df5718ea0ba9d1e4e7252 \
+    --hash=sha256:ad0e0b1ce9bd3edb5cf64b5b5b76eacbfdac8c5367153aeeec8a8b1407f68342 \
+    --hash=sha256:add8850db04b98507a8b62d248a326ecc8561e6d24336d1ca5c605bbfaab4cad \
+    --hash=sha256:afab2f7f2486a866ff04d6d905e9386ca6a231379181a3838abce1f32fbdcc37 \
+    --hash=sha256:b5587feda2b65a79da985ae6d116daf6428bf7489992badc29fc96d16cd27b05 \
+    --hash=sha256:b9198c1f1f8910a3b86b60f4fe2556d9d28d3fefe35bffe6be509a27402e694d \
+    --hash=sha256:bc164f32dd9691e7082ce5df24b4cf8c6c394bbf9bdeeb5d843127cd07ab8ad2 \
+    --hash=sha256:bcde83a553a96dc7533736c547bddaa35414a2566ab0ecf7d3964fc4bdb84c11 \
+    --hash=sha256:c40df31a75de98db2cdfead6074d4449cd009e79f54c1ebe5e5f1f153c68ad20 \
+    --hash=sha256:c4f614581b61a26fbbba232a1391f6cee82bc26f2abbb6a0b44a9bba25c56a1c \
+    --hash=sha256:c9bedebdc5fdad48af8783022bae307746d54006b783007d1d3c38e10872a2c6 \
+    --hash=sha256:cb324bb903330cbb35d87cce367a12631cd5720afa06e5b9c906483970946da6 \
+    --hash=sha256:d00313681015ac498e1736b304446ee6d1c72c5b287cd196996dad84369998f7 \
+    --hash=sha256:d0b0efc7279d768db7c74d3d07f0b5c81280d16ae3fb14e9081dc903e8360771 \
+    --hash=sha256:d0d5a63f1768fed7e78cf55712dee81f5a345e34d34224f3507ebf71df2b754d \
+    --hash=sha256:d1b8b4d6379fe55f471914345fe6171d81a18649dacf3248abfc9c349b4442eb \
+    --hash=sha256:d36608557b4dcd7a62c29ad4cd7c5a1720bbf7dc942eff9dc42d2c542a5f042d \
+    --hash=sha256:d43c2d7504eda566c50203cdc9dc043aff6f55f1b7dae0dcd79dfefef9159d1c \
+    --hash=sha256:d73efb03c5b39249c82488a994f0998f9e4399e3d085209d2120503305ba77a8 \
+    --hash=sha256:d936ae682d5b878af9d9eb4d8bb1fdd5e41275c8eb59ceddb0aeed857bb264a2 \
+    --hash=sha256:dd011fc3c1d88b779645495fdb8189fb318a26981eebcce14109460e062f209b \
+    --hash=sha256:dd5b9b1783e14803e362a558680d88939e830db2466f3fa22df5c9319f8eea94 \
+    --hash=sha256:dd6a7dabcc4c32daf601bc45e01b79175dde4b52548becea4f9545b0a4428169 \
+    --hash=sha256:dd7230d061e755d60a4d5445bae854afe33444cdb182f3815cff26ac9fb29a15 \
+    --hash=sha256:e0d2b00ecbcd1a3c5ea1abc8bb99a26508f758c1759fd01c3be482a3655a176f \
+    --hash=sha256:e1a1452ad5723ff129b081e3c8aa4ba56b8734fee4223355ed7b815a7ece69bc \
+    --hash=sha256:e88abff510dcff903a18d11c2a75f9964e768d99c8d147839913886144b2065e \
+    --hash=sha256:ea7a4a998c87c5674a27089e022110a1a08a7753f21af3baf09efe9915c23c3c \
+    --hash=sha256:eb47ee773ce67476a960e2db4a0a906680c54f662521550828c0cc57d0099426 \
+    --hash=sha256:eed8cd98a7b24861da9d3d937f5fbfb6657350c547528a117297fe49e3960667 \
+    --hash=sha256:ef28c3b328d29b5e2756903aed888960bc5df39b4c2eab157ae212f70ed5bf74 \
+    --hash=sha256:ef59a53be400c1fad2c914b8d74c9d42384fed5174f9321dd021b7017fd40270 \
+    --hash=sha256:f39caec26007a2d0efab6b8b1d74873ede9351962707afab622cc2285dd26ed0 \
+    --hash=sha256:f8efb03ca77bd7725dfacc9254df00d73e6f43013cf39bd37ef1a8ed0ebb5165 \
+    --hash=sha256:fa97278ae6614346b5ca41a45a911f37a3261b57dbe4a00602048652c862c28b \
+    --hash=sha256:fc3dc9fb413fc34c396f52f4c87de18d0bd5023804afa8ab5cc224deeb6a9900 \
+    --hash=sha256:ff7bc1bbdaa3e487c9469128bf39408e91f5573901cb852e03af378d3582c52d
+    # via orbax-checkpoint
+six==1.17.0 \
+    --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \
+    --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81
+    # via
+    #   astunparse
+    #   google-api-core
+    #   google-api-python-client
+    #   google-auth
+    #   google-pasta
+    #   oauth2client
+    #   promise
+    #   python-dateutil
+    #   tensorboard
+    #   tensorflow-cpu
+statsmodels==0.14.4 \
+    --hash=sha256:1322286a7bfdde2790bf72d29698a1b76c20b8423a55bdcd0d457969d0041f72 \
+    --hash=sha256:17672b30c6b98afe2b095591e32d1d66d4372f2651428e433f16a3667f19eabb \
+    --hash=sha256:2a337b731aa365d09bb0eab6da81446c04fde6c31976b1d8e3d3a911f0f1e07b \
+    --hash=sha256:3bb2e580d382545a65f298589809af29daeb15f9da2eb252af8f79693e618abc \
+    --hash=sha256:46ac7ddefac0c9b7b607eed1d47d11e26fe92a1bc1f4d9af48aeed4e21e87981 \
+    --hash=sha256:4793b01b7a5f5424f5a1dbcefc614c83c7608aa2b035f087538253007c339d5d \
+    --hash=sha256:4bbb150620b53133d6cd1c5d14c28a4f85701e6c781d9b689b53681effaa655f \
+    --hash=sha256:5221dba7424cf4f2561b22e9081de85f5bb871228581124a0d1b572708545199 \
+    --hash=sha256:5d69e0f39060dc72c067f9bb6e8033b6dccdb0bae101d76a7ef0bcc94e898b67 \
+    --hash=sha256:5ed7e118e6e3e02d6723a079b8c97eaadeed943fa1f7f619f7148dfc7862670f \
+    --hash=sha256:631bb52159117c5da42ba94bd94859276b68cab25dc4cac86475bc24671143bc \
+    --hash=sha256:6e9ddefba1d4e1107c1f20f601b0581421ea3ad9fd75ce3c2ba6a76b6dc4682c \
+    --hash=sha256:6f43da7957e00190104c5dd0f661bfc6dfc68b87313e3f9c4dbd5e7d222e0aeb \
+    --hash=sha256:7a62f1fc9086e4b7ee789a6f66b3c0fc82dd8de1edda1522d30901a0aa45e42b \
+    --hash=sha256:7f7917a51766b4e074da283c507a25048ad29a18e527207883d73535e0dc6184 \
+    --hash=sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0 \
+    --hash=sha256:8286f69a5e1d0e0b366ffed5691140c83d3efc75da6dbf34a3d06e88abfaaab6 \
+    --hash=sha256:91341cbde9e8bea5fb419a76e09114e221567d03f34ca26e6d67ae2c27d8fe3c \
+    --hash=sha256:9729642884147ee9db67b5a06a355890663d21f76ed608a56ac2ad98b94d201a \
+    --hash=sha256:a6087ecb0714f7c59eb24c22781491e6f1cfffb660b4740e167625ca4f052056 \
+    --hash=sha256:aa74aaa26eaa5012b0a01deeaa8a777595d0835d3d6c7175f2ac65435a7324d2 \
+    --hash=sha256:ab5e6312213b8cfb9dca93dd46a0f4dccb856541f91d3306227c3d92f7659245 \
+    --hash=sha256:b5a24f5d2c22852d807d2b42daf3a61740820b28d8381daaf59dcb7055bf1a79 \
+    --hash=sha256:bb695c2025d122a101c2aca66d2b78813c321b60d3a7c86bb8ec4467bb53b0f9 \
+    --hash=sha256:d330da34f59f1653c5193f9fe3a3a258977c880746db7f155fc33713ea858db5 \
+    --hash=sha256:d9c8fa28dfd75753d9cf62769ba1fecd7e73a0be187f35cc6f54076f98aa3f3f \
+    --hash=sha256:df4f7864606fa843d7e7c0e6af288f034a2160dba14e6ccc09020a3cf67cb092 \
+    --hash=sha256:e31b95ac603415887c9f0d344cb523889cf779bc52d68e27e2d23c358958fec7 \
+    --hash=sha256:e332c2d9b806083d1797231280602340c5c913f90d4caa0213a6a54679ce9331 \
+    --hash=sha256:f5f537f7d000de4a1708c63400755152b862cd4926bb81a86568e347c19c364b
+    # via plotnine
+sympy==1.13.3 \
+    --hash=sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73 \
+    --hash=sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9
+    # via torch
+tabulate==0.9.0 \
+    --hash=sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c \
+    --hash=sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f
+    # via dm-haiku
+tensorboard==2.18.0 \
+    --hash=sha256:107ca4821745f73e2aefa02c50ff70a9b694f39f790b11e6f682f7d326745eab
+    # via tensorflow-cpu
+tensorboard-data-server==0.7.2 \
+    --hash=sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb \
+    --hash=sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60 \
+    --hash=sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530
+    # via tensorboard
+tensorflow-cpu==2.18.0 ; sys_platform == "linux" and platform_machine == "x86_64" \
+    --hash=sha256:089e71746960ea581dca53401f84b3b99c8537313e337a9e5dbf97036a936f7e \
+    --hash=sha256:0b093b727c2f2a8cf4ee4f2c7352c8e958a2a1d27a452961b8d5f43a0798dcd2 \
+    --hash=sha256:107775c86045c7a3042cf9d79046be49cd68a3278f5d9f8c75158c049259307c \
+    --hash=sha256:39bd421ad125e4163d6e2d41ab0e158b583fb5c6f9254522fb87635b0e70b891 \
+    --hash=sha256:482dc228f513c65bbe34abbb7c456765bd8e1599dbeae0924fc96d97578f9987 \
+    --hash=sha256:9f98466fde4a307d39b71309b1a9a8a4f1bb972e70d36b1ead0817405cc518f2 \
+    --hash=sha256:c2dc132be3215f4f8d56766bf8b6645fa6d45dea70b46ae092c5cdd3e958c0ac \
+    --hash=sha256:c61ef38e24686cd460e31f988ea36f73092ccc628193df95116d6c5f690393f9
+    # via -r requirements-dev.txt
+tensorflow-datasets==4.9.7 \
+    --hash=sha256:948d7a68ee693d9709cbfe955f97344b98647e5512e0e42dd2a61f3e7925d68b \
+    --hash=sha256:f6fdfe745b5df2a37bde8bb2ef149188ebbca4af546226b461f3bad26bc42875
+    # via -r requirements-dev.txt
+tensorflow-io-gcs-filesystem==0.37.1 \
+    --hash=sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95 \
+    --hash=sha256:249c12b830165841411ba71e08215d0e94277a49c551e6dd5d72aab54fe5491b \
+    --hash=sha256:257aab23470a0796978efc9c2bcf8b0bc80f22e6298612a4c0a50d3f4e88060c \
+    --hash=sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c \
+    --hash=sha256:32c50ab4e29a23c1f91cd0f9ab8c381a0ab10f45ef5c5252e94965916041737c \
+    --hash=sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed \
+    --hash=sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda \
+    --hash=sha256:8943036bbf84e7a2be3705cb56f9c9df7c48c9e614bb941f0936c58e3ca89d6f \
+    --hash=sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70 \
+    --hash=sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27 \
+    --hash=sha256:b02f9c5f94fd62773954a04f69b68c4d576d076fd0db4ca25d5479f0fbfcdbad \
+    --hash=sha256:ee5da49019670ed364f3e5fb86b46420841a6c3cb52a300553c63841671b3e6d \
+    --hash=sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556 \
+    --hash=sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f \
+    --hash=sha256:fe8dcc6d222258a080ac3dfcaaaa347325ce36a7a046277f6b3e19abc1efb3c5 \
+    --hash=sha256:ffebb6666a7bfc28005f4fbbb111a455b5e7d6cd3b12752b7050863ecb27d5cc
+    # via tensorflow-cpu
+tensorflow-metadata==1.16.1 \
+    --hash=sha256:2ce72ea31d78a00c0c74c6d465482335aa5cb2a3b2a104dedba0b258bc7bb18a
+    # via tensorflow-datasets
+tensorstore==0.1.71 \
+    --hash=sha256:0bd87899e1c6049b078e785e8b7871e2579202f5b929e89c3c37340965b922bb \
+    --hash=sha256:1a6cdcc52e4b841d23e50a2fa28e016e6d9f61d6ea9188d4555ea189b040a0f6 \
+    --hash=sha256:31e39ed7d374f43e45bff52611bad99315c577b44c099b2f6837b801b3467645 \
+    --hash=sha256:321d6302e5116b20fda500821240eba7de28477209070728d98edefced97d2b5 \
+    --hash=sha256:373558b803d8c2c57fc613b11007ae58139f19a3cddd443a0de5d7b5321e5961 \
+    --hash=sha256:46ff0f41ef3b1dbd1a925d62e6475523a587bcd37b277bf4f633f46f5b7e22bd \
+    --hash=sha256:52b546f076b2c3bf217c60f05de4124cc1197ce92f8e826e7ec73ae324074a5a \
+    --hash=sha256:583f0ec143062176ca21fe8dcc3b3b6f94d7f4ea643443b49942d3d1a2fa29b4 \
+    --hash=sha256:5c37c7b385517b568282a7aedded446216335d0cb41187c93c80b53596c92c96 \
+    --hash=sha256:6276e279b45eb5d9b95c4df3e7956255f414fd4b128d2de16d8aecde86c36357 \
+    --hash=sha256:65c3a1a2a35a1b537403f36403d258caab477e564bc0f64109b941cc77b4f203 \
+    --hash=sha256:75a9ff1f7b6759094cc210baa4e8135c4898472e08a7036476374433d03c6a34 \
+    --hash=sha256:87a97a34b0475ddc7d2afc40e5dd7f8d12522aa81edfbcccb39628cf591454d5 \
+    --hash=sha256:95041b55a2ec86d1f6690512d1883581b18f2f4f46c3d97894aeb0ac2db6af7f \
+    --hash=sha256:b961bbbb7a1c6a48e4c1406a98caebeb400461e2e75a08b6df0c013294037a15 \
+    --hash=sha256:ced5430bcdfa7fcb3a6bdc44733176158cb877b35bdd233cac82e25b4cc94e92 \
+    --hash=sha256:d3a24feb6195f1c222162965c0107c9ff56d322cca23e19f0e66636f6eb80f14 \
+    --hash=sha256:de8843fb3462899de7bcdeeaccb92303a9d61006bc36364deb4a88df46320ba4 \
+    --hash=sha256:ecf4feb574051f40e81572ea2ff8e5895b2980c5dd3b29fe81c70d25e42d3b6a \
+    --hash=sha256:f3e62aa7b473c0715706a809da3591763906059e8731a38c0b495337a1dc55ea \
+    --hash=sha256:f40e73bcdc333dfb3f7fe0fcf023bcbec41533c9856657718ff76ece1a1902e0
+    # via
+    #   flax
+    #   orbax-checkpoint
+termcolor==2.5.0 \
+    --hash=sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8 \
+    --hash=sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f
+    # via
+    #   -r requirements-dev.txt
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+threadpoolctl==3.5.0 \
+    --hash=sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107 \
+    --hash=sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467
+    # via scikit-learn
+tokenizers==0.21.0 \
+    --hash=sha256:089d56db6782a73a27fd8abf3ba21779f5b85d4a9f35e3b493c7bbcbbf0d539b \
+    --hash=sha256:3c4c93eae637e7d2aaae3d376f06085164e1660f89304c0ab2b1d08a406636b2 \
+    --hash=sha256:400832c0904f77ce87c40f1a8a27493071282f785724ae62144324f171377273 \
+    --hash=sha256:4145505a973116f91bc3ac45988a92e618a6f83eb458f49ea0790df94ee243ff \
+    --hash=sha256:6b177fb54c4702ef611de0c069d9169f0004233890e0c4c5bd5508ae05abf193 \
+    --hash=sha256:6b43779a269f4629bebb114e19c3fca0223296ae9fea8bb9a7a6c6fb0657ff8e \
+    --hash=sha256:87841da5a25a3a5f70c102de371db120f41873b854ba65e52bccd57df5a3780c \
+    --hash=sha256:9aeb255802be90acfd363626753fda0064a8df06031012fe7d52fd9a905eb00e \
+    --hash=sha256:c87ca3dc48b9b1222d984b6b7490355a6fdb411a2d810f6f05977258400ddb74 \
+    --hash=sha256:d8b09dbeb7a8d73ee204a70f94fc06ea0f17dcf0844f16102b9f414f0b7463ba \
+    --hash=sha256:e84ca973b3a96894d1707e189c14a774b701596d579ffc7e69debfc036a61a04 \
+    --hash=sha256:eb1702c2f27d25d9dd5b389cc1f2f51813e99f8ca30d9e25348db6585a97e24a \
+    --hash=sha256:eb7202d231b273c34ec67767378cd04c767e967fda12d4a9e36208a34e2f137e \
+    --hash=sha256:ee0894bf311b75b0c03079f33859ae4b2334d675d4e93f5a4132e1eae2834fe4 \
+    --hash=sha256:f53ea537c925422a2e0e92a24cce96f6bc5046bbef24a1652a5edc8ba975f62e
+    # via transformers
+toml==0.10.2 \
+    --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \
+    --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f
+    # via tensorflow-datasets
+toolz==1.0.0 \
+    --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \
+    --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02
+    # via chex
+torch==2.3.0 \
+    --hash=sha256:09c81c5859a5b819956c6925a405ef1cdda393c9d8a01ce3851453f699d3358c \
+    --hash=sha256:1bf023aa20902586f614f7682fedfa463e773e26c58820b74158a72470259459 \
+    --hash=sha256:20572f426965dd8a04e92a473d7e445fa579e09943cc0354f3e6fef6130ce061 \
+    --hash=sha256:493d54ee2f9df100b5ce1d18c96dbb8d14908721f76351e908c9d2622773a788 \
+    --hash=sha256:4fb27b35dbb32303c2927da86e27b54a92209ddfb7234afb1949ea2b3effffea \
+    --hash=sha256:5515503a193781fd1b3f5c474e89c9dfa2faaa782b2795cc4a7ab7e67de923f6 \
+    --hash=sha256:6ae9f64b09516baa4ef890af0672dc981c20b1f0d829ce115d4420a247e88fba \
+    --hash=sha256:729804e97b7cf19ae9ab4181f91f5e612af07956f35c8b2c8e9d9f3596a8e877 \
+    --hash=sha256:758ef938de87a2653bba74b91f703458c15569f1562bf4b6c63c62d9c5a0c1f5 \
+    --hash=sha256:760f8bedff506ce9e6e103498f9b1e9e15809e008368594c3a66bf74a8a51380 \
+    --hash=sha256:a306c87a3eead1ed47457822c01dfbd459fe2920f2d38cbdf90de18f23f72542 \
+    --hash=sha256:b0de2bdc0486ea7b14fc47ff805172df44e421a7318b7c4d92ef589a75d27410 \
+    --hash=sha256:bce43af735c3da16cc14c7de2be7ad038e2fbf75654c2e274e575c6c05772ace \
+    --hash=sha256:cd0dc498b961ab19cb3f8dbf0c6c50e244f2f37dbfa05754ab44ea057c944ef9 \
+    --hash=sha256:d24e328226d8e2af7cf80fcb1d2f1d108e0de32777fab4aaa2b37b9765d8be73 \
+    --hash=sha256:d8ea5a465dbfd8501f33c937d1f693176c9aef9d1c1b0ca1d44ed7b0a18c52ac \
+    --hash=sha256:dca986214267b34065a79000cee54232e62b41dff1ec2cab9abc3fc8b3dee0ad \
+    --hash=sha256:e05f836559251e4096f3786ee99f4a8cbe67bc7fbedba8ad5e799681e47c5e80 \
+    --hash=sha256:e65ba85ae292909cde0dde6369826d51165a3fc8823dc1854cd9432d7f79b932 \
+    --hash=sha256:f9b98bf1a3c8af2d4c41f0bf1433920900896c446d1ddc128290ff146d1eb4bd
+    # via
+    #   -r requirements-dev.txt
+    #   torchvision
+torch-xla==2.3.0 ; sys_platform == "linux" and platform_machine == "x86_64" \
+    --hash=sha256:262876ab0e95a4ecd131afa33a89ad7f94544f878a74198ee52fcf723af39e6f \
+    --hash=sha256:6678b2bea3baeda916cdb314d5ad190eeb388e71a4de04ccfa948ab74d6d4c72 \
+    --hash=sha256:8282e0ff92f42e18e22f65c0ec5a17acd5bc51728b1fdeb6b4ccade3a313c6ac \
+    --hash=sha256:e0b2f88baf3373b9c0a4f351488dbb9b4b007b52c1c66f65b65e1984b5f0f227
+    # via -r requirements-dev.txt
+torchvision==0.18.0 \
+    --hash=sha256:2115a1906c015f5da9ceedc40a983313b0fd6e2c8a17108a92991706f51f6987 \
+    --hash=sha256:36efd87001c6bee2383e043e46a025affb03179747c8f4777b9918527ffce756 \
+    --hash=sha256:3d7955398d4ceaad77c487c2c44f6f7813112402c9bab8cd906d346005891048 \
+    --hash=sha256:493c45f9937dad37aa1b64b14da17c7a589c72b91adc4837d431009cfe29bd53 \
+    --hash=sha256:4c334b3e719ba0a9ba6e15d4aff1178f5e6d029174f346163fed525f0ccfffd3 \
+    --hash=sha256:5337f6acfa1fe959d5cb340d01a00614d6b31ce7a4824ccb95435a85c5273b95 \
+    --hash=sha256:6323f7e5423ff2594d5891863b919deb9d0de95f01c36bf26fbd879036b6ed08 \
+    --hash=sha256:6896a52168befe1105fb3c9335287390ed227e71d1e4ec4d68b62e8a3099fc09 \
+    --hash=sha256:6ad70ddfa879bda5ed886b2518fe562640e0059787cbd65cb2bffa7674541410 \
+    --hash=sha256:75e22ecf44a13b8f95b8ad421c0261282d859c61816badaca1959e073ccdd691 \
+    --hash=sha256:7c770f0f748e0b17f57c0297508d7254f686cdf03fc2e2949f422b20574f4c0f \
+    --hash=sha256:925d0a82cccf6f986c18b29b4392a942db65cbdb73c13a129c8493822eb9e36f \
+    --hash=sha256:95b42d0dc599b47a01530c7439a5751e67e45b85e3a67113989cf7c7c70f2039 \
+    --hash=sha256:a964afbc7ddf50a46b941477f6c35729b416deedd139756befd488245e2e226d \
+    --hash=sha256:b657d052d146f24cb3b2a78219bfc82ae70a9706671c50f632528907d10cccec \
+    --hash=sha256:bd8e6f3b5beb49965f15c461302488edfa3d8c2d01d3bb79b150d6fb62711e3a \
+    --hash=sha256:ccc292e093771d5baacf5535ac4416306b6b5f15676341cd4d010d8542eace25 \
+    --hash=sha256:dd61628a3d189c6852a12dc5ed4cd2eece66d2d67f35a866cb16f1dcb06c8c62 \
+    --hash=sha256:e5a24d620cea14a4bb89f24aa2b506230c0a16a3ada57fc53ad80cfd256a2128 \
+    --hash=sha256:eb9d83c0e1dbb54ecb0fb04c87f786333e3a6fb8b9c400aca7c31081f9aa5707
+    # via -r requirements-dev.txt
+tqdm==4.67.1 \
+    --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \
+    --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2
+    # via
+    #   datasets
+    #   etils
+    #   huggingface-hub
+    #   tensorflow-datasets
+    #   transformers
+transformers==4.47.1 \
+    --hash=sha256:6c29c05a5f595e278481166539202bf8641281536df1c42357ee58a45d0a564a \
+    --hash=sha256:d2f5d19bb6283cd66c893ec7e6d931d6370bbf1cc93633326ff1f41a40046c9c
+    # via -r requirements-dev.txt
+triton==2.3.0 \
+    --hash=sha256:038e06a09c06a164fef9c48de3af1e13a63dc1ba3c792871e61a8e79720ea440 \
+    --hash=sha256:218d742e67480d9581bafb73ed598416cc8a56f6316152e5562ee65e33de01c0 \
+    --hash=sha256:381ec6b3dac06922d3e4099cfc943ef032893b25415de295e82b1a82b0359d2c \
+    --hash=sha256:3c3d9607f85103afdb279938fc1dd2a66e4f5999a58eb48a346bd42738f986dd \
+    --hash=sha256:5ce4b8ff70c48e47274c66f269cce8861cf1dc347ceeb7a67414ca151b1822d8 \
+    --hash=sha256:6d8f636e0341ac348899a47a057c3daea99ea7db31528a225a3ba4ded28ccc65
+    # via torch
+typing-extensions==4.12.2 \
+    --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \
+    --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8
+    # via
+    #   chex
+    #   etils
+    #   flax
+    #   huggingface-hub
+    #   optree
+    #   orbax-checkpoint
+    #   simple-parsing
+    #   tensorflow-cpu
+    #   torch
+tzdata==2024.2 \
+    --hash=sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc \
+    --hash=sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd
+    # via pandas
+uritemplate==3.0.1 \
+    --hash=sha256:07620c3f3f8eed1f12600845892b0e036a2420acf513c53f7de0abd911a5894f \
+    --hash=sha256:5af8ad10cec94f215e3f48112de2022e1d5a37ed427fbd88652fa908f2ab7cae
+    # via google-api-python-client
+urllib3==2.3.0 \
+    --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \
+    --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d
+    # via
+    #   requests
+    #   responses
+werkzeug==3.1.3 \
+    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
+    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+    # via tensorboard
+wheel==0.45.1 \
+    --hash=sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729 \
+    --hash=sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248
+    # via astunparse
+wrapt==1.17.0 \
+    --hash=sha256:0229b247b0fc7dee0d36176cbb79dbaf2a9eb7ecc50ec3121f40ef443155fb1d \
+    --hash=sha256:0698d3a86f68abc894d537887b9bbf84d29bcfbc759e23f4644be27acf6da301 \
+    --hash=sha256:0a0a1a1ec28b641f2a3a2c35cbe86c00051c04fffcfcc577ffcdd707df3f8635 \
+    --hash=sha256:0b48554952f0f387984da81ccfa73b62e52817a4386d070c75e4db7d43a28c4a \
+    --hash=sha256:0f2a28eb35cf99d5f5bd12f5dd44a0f41d206db226535b37b0c60e9da162c3ed \
+    --hash=sha256:140ea00c87fafc42739bd74a94a5a9003f8e72c27c47cd4f61d8e05e6dec8721 \
+    --hash=sha256:16187aa2317c731170a88ef35e8937ae0f533c402872c1ee5e6d079fcf320801 \
+    --hash=sha256:17fcf043d0b4724858f25b8826c36e08f9fb2e475410bece0ec44a22d533da9b \
+    --hash=sha256:18b956061b8db634120b58f668592a772e87e2e78bc1f6a906cfcaa0cc7991c1 \
+    --hash=sha256:2399408ac33ffd5b200480ee858baa58d77dd30e0dd0cab6a8a9547135f30a88 \
+    --hash=sha256:2a0c23b8319848426f305f9cb0c98a6e32ee68a36264f45948ccf8e7d2b941f8 \
+    --hash=sha256:2dfb7cff84e72e7bf975b06b4989477873dcf160b2fd89959c629535df53d4e0 \
+    --hash=sha256:2f495b6754358979379f84534f8dd7a43ff8cff2558dcdea4a148a6e713a758f \
+    --hash=sha256:33539c6f5b96cf0b1105a0ff4cf5db9332e773bb521cc804a90e58dc49b10578 \
+    --hash=sha256:3c34f6896a01b84bab196f7119770fd8466c8ae3dfa73c59c0bb281e7b588ce7 \
+    --hash=sha256:498fec8da10e3e62edd1e7368f4b24aa362ac0ad931e678332d1b209aec93045 \
+    --hash=sha256:4d63f4d446e10ad19ed01188d6c1e1bb134cde8c18b0aa2acfd973d41fcc5ada \
+    --hash=sha256:4e4b4385363de9052dac1a67bfb535c376f3d19c238b5f36bddc95efae15e12d \
+    --hash=sha256:4e547b447073fc0dbfcbff15154c1be8823d10dab4ad401bdb1575e3fdedff1b \
+    --hash=sha256:4f643df3d4419ea3f856c5c3f40fec1d65ea2e89ec812c83f7767c8730f9827a \
+    --hash=sha256:4f763a29ee6a20c529496a20a7bcb16a73de27f5da6a843249c7047daf135977 \
+    --hash=sha256:5ae271862b2142f4bc687bdbfcc942e2473a89999a54231aa1c2c676e28f29ea \
+    --hash=sha256:5d8fd17635b262448ab8f99230fe4dac991af1dabdbb92f7a70a6afac8a7e346 \
+    --hash=sha256:69c40d4655e078ede067a7095544bcec5a963566e17503e75a3a3e0fe2803b13 \
+    --hash=sha256:69d093792dc34a9c4c8a70e4973a3361c7a7578e9cd86961b2bbf38ca71e4e22 \
+    --hash=sha256:6a9653131bda68a1f029c52157fd81e11f07d485df55410401f745007bd6d339 \
+    --hash=sha256:6ff02a91c4fc9b6a94e1c9c20f62ea06a7e375f42fe57587f004d1078ac86ca9 \
+    --hash=sha256:714c12485aa52efbc0fc0ade1e9ab3a70343db82627f90f2ecbc898fdf0bb181 \
+    --hash=sha256:7264cbb4a18dc4acfd73b63e4bcfec9c9802614572025bdd44d0721983fc1d9c \
+    --hash=sha256:73a96fd11d2b2e77d623a7f26e004cc31f131a365add1ce1ce9a19e55a1eef90 \
+    --hash=sha256:74bf625b1b4caaa7bad51d9003f8b07a468a704e0644a700e936c357c17dd45a \
+    --hash=sha256:81b1289e99cf4bad07c23393ab447e5e96db0ab50974a280f7954b071d41b489 \
+    --hash=sha256:8425cfce27b8b20c9b89d77fb50e368d8306a90bf2b6eef2cdf5cd5083adf83f \
+    --hash=sha256:875d240fdbdbe9e11f9831901fb8719da0bd4e6131f83aa9f69b96d18fae7504 \
+    --hash=sha256:879591c2b5ab0a7184258274c42a126b74a2c3d5a329df16d69f9cee07bba6ea \
+    --hash=sha256:89fc28495896097622c3fc238915c79365dd0ede02f9a82ce436b13bd0ab7569 \
+    --hash=sha256:8a5e7cc39a45fc430af1aefc4d77ee6bad72c5bcdb1322cfde852c15192b8bd4 \
+    --hash=sha256:8f8909cdb9f1b237786c09a810e24ee5e15ef17019f7cecb207ce205b9b5fcce \
+    --hash=sha256:914f66f3b6fc7b915d46c1cc424bc2441841083de01b90f9e81109c9759e43ab \
+    --hash=sha256:92a3d214d5e53cb1db8b015f30d544bc9d3f7179a05feb8f16df713cecc2620a \
+    --hash=sha256:948a9bd0fb2c5120457b07e59c8d7210cbc8703243225dbd78f4dfc13c8d2d1f \
+    --hash=sha256:9c900108df470060174108012de06d45f514aa4ec21a191e7ab42988ff42a86c \
+    --hash=sha256:9f2939cd4a2a52ca32bc0b359015718472d7f6de870760342e7ba295be9ebaf9 \
+    --hash=sha256:a4192b45dff127c7d69b3bdfb4d3e47b64179a0b9900b6351859f3001397dabf \
+    --hash=sha256:a8fc931382e56627ec4acb01e09ce66e5c03c384ca52606111cee50d931a342d \
+    --hash=sha256:ad47b095f0bdc5585bced35bd088cbfe4177236c7df9984b3cc46b391cc60627 \
+    --hash=sha256:b1ca5f060e205f72bec57faae5bd817a1560fcfc4af03f414b08fa29106b7e2d \
+    --hash=sha256:ba1739fb38441a27a676f4de4123d3e858e494fac05868b7a281c0a383c098f4 \
+    --hash=sha256:baa7ef4e0886a6f482e00d1d5bcd37c201b383f1d314643dfb0367169f94f04c \
+    --hash=sha256:bb90765dd91aed05b53cd7a87bd7f5c188fcd95960914bae0d32c5e7f899719d \
+    --hash=sha256:bc7f729a72b16ee21795a943f85c6244971724819819a41ddbaeb691b2dd85ad \
+    --hash=sha256:bdf62d25234290db1837875d4dceb2151e4ea7f9fff2ed41c0fde23ed542eb5b \
+    --hash=sha256:c30970bdee1cad6a8da2044febd824ef6dc4cc0b19e39af3085c763fdec7de33 \
+    --hash=sha256:d2c63b93548eda58abf5188e505ffed0229bf675f7c3090f8e36ad55b8cbc371 \
+    --hash=sha256:d751300b94e35b6016d4b1e7d0e7bbc3b5e1751e2405ef908316c2a9024008a1 \
+    --hash=sha256:da427d311782324a376cacb47c1a4adc43f99fd9d996ffc1b3e8529c4074d393 \
+    --hash=sha256:daba396199399ccabafbfc509037ac635a6bc18510ad1add8fd16d4739cdd106 \
+    --hash=sha256:e185ec6060e301a7e5f8461c86fb3640a7beb1a0f0208ffde7a65ec4074931df \
+    --hash=sha256:e4a557d97f12813dc5e18dad9fa765ae44ddd56a672bb5de4825527c847d6379 \
+    --hash=sha256:e5ed16d95fd142e9c72b6c10b06514ad30e846a0d0917ab406186541fe68b451 \
+    --hash=sha256:e711fc1acc7468463bc084d1b68561e40d1eaa135d8c509a65dd534403d83d7b \
+    --hash=sha256:f28b29dc158ca5d6ac396c8e0a2ef45c4e97bb7e65522bfc04c989e6fe814575 \
+    --hash=sha256:f335579a1b485c834849e9075191c9898e0731af45705c2ebf70e0cd5d58beed \
+    --hash=sha256:fce6fee67c318fdfb7f285c29a82d84782ae2579c0e1b385b7f36c6e8074fffb \
+    --hash=sha256:fd136bb85f4568fffca995bd3c8d52080b1e5b225dbf1c2b17b66b4c5fa02838
+    # via
+    #   tensorflow-cpu
+    #   tensorflow-datasets
+xxhash==3.5.0 \
+    --hash=sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1 \
+    --hash=sha256:0691bfcc4f9c656bcb96cc5db94b4d75980b9d5589f2e59de790091028580837 \
+    --hash=sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb \
+    --hash=sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84 \
+    --hash=sha256:0a80ad0ffd78bef9509eee27b4a29e56f5414b87fb01a888353e3d5bda7038bd \
+    --hash=sha256:0adfbd36003d9f86c8c97110039f7539b379f28656a04097e7434d3eaf9aa131 \
+    --hash=sha256:0ec70a89be933ea49222fafc3999987d7899fc676f688dd12252509434636622 \
+    --hash=sha256:1030a39ba01b0c519b1a82f80e8802630d16ab95dc3f2b2386a0b5c8ed5cbb10 \
+    --hash=sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da \
+    --hash=sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166 \
+    --hash=sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415 \
+    --hash=sha256:13de2b76c1835399b2e419a296d5b38dc4855385d9e96916299170085ef72f57 \
+    --hash=sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00 \
+    --hash=sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d \
+    --hash=sha256:160e0c19ee500482ddfb5d5570a0415f565d8ae2b3fd69c5dcfce8a58107b1c3 \
+    --hash=sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c \
+    --hash=sha256:2061188a1ba352fc699c82bff722f4baacb4b4b8b2f0c745d2001e56d0dfb514 \
+    --hash=sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558 \
+    --hash=sha256:23241ff6423378a731d84864bf923a41649dc67b144debd1077f02e6249a0d54 \
+    --hash=sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2 \
+    --hash=sha256:297595fe6138d4da2c8ce9e72a04d73e58725bb60f3a19048bc96ab2ff31c692 \
+    --hash=sha256:2b4154c00eb22e4d543f472cfca430e7962a0f1d0f3778334f2e08a7ba59363c \
+    --hash=sha256:2e76e83efc7b443052dd1e585a76201e40b3411fe3da7af4fe434ec51b2f163b \
+    --hash=sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af \
+    --hash=sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520 \
+    --hash=sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd \
+    --hash=sha256:33eac61d0796ca0591f94548dcfe37bb193671e0c9bcf065789b5792f2eda644 \
+    --hash=sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6 \
+    --hash=sha256:38c384c434021e4f62b8d9ba0bc9467e14d394893077e2c66d826243025e1f81 \
+    --hash=sha256:392f52ebbb932db566973693de48f15ce787cabd15cf6334e855ed22ea0be5b3 \
+    --hash=sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c \
+    --hash=sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2 \
+    --hash=sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf \
+    --hash=sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6 \
+    --hash=sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b \
+    --hash=sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482 \
+    --hash=sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7 \
+    --hash=sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6 \
+    --hash=sha256:50ac2184ffb1b999e11e27c7e3e70cc1139047e7ebc1aa95ed12f4269abe98d4 \
+    --hash=sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9 \
+    --hash=sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637 \
+    --hash=sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2 \
+    --hash=sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9 \
+    --hash=sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da \
+    --hash=sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23 \
+    --hash=sha256:5d2a01dcce81789cf4b12d478b5464632204f4c834dc2d064902ee27d2d1f0ee \
+    --hash=sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b \
+    --hash=sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4 \
+    --hash=sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8 \
+    --hash=sha256:602d339548d35a8579c6b013339fb34aee2df9b4e105f985443d2860e4d7ffaa \
+    --hash=sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898 \
+    --hash=sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793 \
+    --hash=sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da \
+    --hash=sha256:63107013578c8a730419adc05608756c3fa640bdc6abe806c3123a49fb829f43 \
+    --hash=sha256:683b94dbd1ca67557850b86423318a2e323511648f9f3f7b1840408a02b9a48c \
+    --hash=sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88 \
+    --hash=sha256:695735deeddfb35da1677dbc16a083445360e37ff46d8ac5c6fcd64917ff9ade \
+    --hash=sha256:6e5f70f6dca1d3b09bccb7daf4e087075ff776e3da9ac870f86ca316736bb4aa \
+    --hash=sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833 \
+    --hash=sha256:6fa0b72f2423e2aa53077e54a61c28e181d23effeaafd73fcb9c494e60930c8e \
+    --hash=sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90 \
+    --hash=sha256:74752ecaa544657d88b1d1c94ae68031e364a4d47005a90288f3bab3da3c970f \
+    --hash=sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6 \
+    --hash=sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680 \
+    --hash=sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da \
+    --hash=sha256:7ccb800c9418e438b44b060a32adeb8393764da7441eb52aa2aa195448935306 \
+    --hash=sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1 \
+    --hash=sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc \
+    --hash=sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43 \
+    --hash=sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c \
+    --hash=sha256:82b833d5563fefd6fceafb1aed2f3f3ebe19f84760fdd289f8b926731c2e6e91 \
+    --hash=sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f \
+    --hash=sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6 \
+    --hash=sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a \
+    --hash=sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7 \
+    --hash=sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198 \
+    --hash=sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623 \
+    --hash=sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839 \
+    --hash=sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5 \
+    --hash=sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9 \
+    --hash=sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0 \
+    --hash=sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6 \
+    --hash=sha256:a5bc08f33c4966f4eb6590d6ff3ceae76151ad744576b5fc6c4ba8edd459fdec \
+    --hash=sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754 \
+    --hash=sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c \
+    --hash=sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e \
+    --hash=sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084 \
+    --hash=sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d \
+    --hash=sha256:a9d360a792cbcce2fe7b66b8d51274ec297c53cbc423401480e53b26161a290d \
+    --hash=sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240 \
+    --hash=sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58 \
+    --hash=sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442 \
+    --hash=sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326 \
+    --hash=sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301 \
+    --hash=sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196 \
+    --hash=sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f \
+    --hash=sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7 \
+    --hash=sha256:c3bc7bf8cb8806f8d1c9bf149c18708cb1c406520097d6b0a73977460ea03602 \
+    --hash=sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3 \
+    --hash=sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606 \
+    --hash=sha256:cc1276d369452040cbb943300dc8abeedab14245ea44056a2943183822513a18 \
+    --hash=sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3 \
+    --hash=sha256:d30bbc1644f726b825b3278764240f449d75f1a8bdda892e641d4a688b1494ae \
+    --hash=sha256:d5e9db7ef3ecbfc0b4733579cea45713a76852b002cf605420b12ef3ef1ec148 \
+    --hash=sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c \
+    --hash=sha256:dd86b8e7f703ec6ff4f351cfdb9f428955859537125904aa8c963604f2e9d3e7 \
+    --hash=sha256:dee1316133c9b463aa81aca676bc506d3f80d8f65aeb0bba2b78d0b30c51d7bd \
+    --hash=sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab \
+    --hash=sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27 \
+    --hash=sha256:e6a4dd644d72ab316b580a1c120b375890e4c52ec392d4aef3c63361ec4d77d1 \
+    --hash=sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab \
+    --hash=sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296 \
+    --hash=sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212 \
+    --hash=sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc \
+    --hash=sha256:f0b48edbebea1b7421a9c687c304f7b44d0677c46498a046079d445454504737 \
+    --hash=sha256:f1abffa122452481a61c3551ab3c89d72238e279e517705b8b03847b1d93d738 \
+    --hash=sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be \
+    --hash=sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8 \
+    --hash=sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e \
+    --hash=sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e \
+    --hash=sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986 \
+    --hash=sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f \
+    --hash=sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f
+    # via datasets
+yarl==1.18.3 \
+    --hash=sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba \
+    --hash=sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193 \
+    --hash=sha256:045b8482ce9483ada4f3f23b3774f4e1bf4f23a2d5c912ed5170f68efb053318 \
+    --hash=sha256:09c7907c8548bcd6ab860e5f513e727c53b4a714f459b084f6580b49fa1b9cee \
+    --hash=sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e \
+    --hash=sha256:0b3c92fa08759dbf12b3a59579a4096ba9af8dd344d9a813fc7f5070d86bbab1 \
+    --hash=sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a \
+    --hash=sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186 \
+    --hash=sha256:1d407181cfa6e70077df3377938c08012d18893f9f20e92f7d2f314a437c30b1 \
+    --hash=sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50 \
+    --hash=sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640 \
+    --hash=sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb \
+    --hash=sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8 \
+    --hash=sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc \
+    --hash=sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5 \
+    --hash=sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58 \
+    --hash=sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2 \
+    --hash=sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393 \
+    --hash=sha256:4ac515b860c36becb81bb84b667466885096b5fc85596948548b667da3bf9f24 \
+    --hash=sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b \
+    --hash=sha256:54d6921f07555713b9300bee9c50fb46e57e2e639027089b1d795ecd9f7fa910 \
+    --hash=sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c \
+    --hash=sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272 \
+    --hash=sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed \
+    --hash=sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1 \
+    --hash=sha256:61e5e68cb65ac8f547f6b5ef933f510134a6bf31bb178be428994b0cb46c2a04 \
+    --hash=sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d \
+    --hash=sha256:6333c5a377c8e2f5fae35e7b8f145c617b02c939d04110c76f29ee3676b5f9a5 \
+    --hash=sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d \
+    --hash=sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889 \
+    --hash=sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae \
+    --hash=sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b \
+    --hash=sha256:77a6e85b90a7641d2e07184df5557132a337f136250caafc9ccaa4a2a998ca2c \
+    --hash=sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576 \
+    --hash=sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34 \
+    --hash=sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477 \
+    --hash=sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990 \
+    --hash=sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2 \
+    --hash=sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512 \
+    --hash=sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069 \
+    --hash=sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a \
+    --hash=sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6 \
+    --hash=sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0 \
+    --hash=sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8 \
+    --hash=sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb \
+    --hash=sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa \
+    --hash=sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8 \
+    --hash=sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e \
+    --hash=sha256:a440a2a624683108a1b454705ecd7afc1c3438a08e890a1513d468671d90a04e \
+    --hash=sha256:a4bb030cf46a434ec0225bddbebd4b89e6471814ca851abb8696170adb163985 \
+    --hash=sha256:a9ca04806f3be0ac6d558fffc2fdf8fcef767e0489d2684a21912cc4ed0cd1b8 \
+    --hash=sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1 \
+    --hash=sha256:ac36703a585e0929b032fbaab0707b75dc12703766d0b53486eabd5139ebadd5 \
+    --hash=sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690 \
+    --hash=sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10 \
+    --hash=sha256:b4f6450109834af88cb4cc5ecddfc5380ebb9c228695afc11915a0bf82116789 \
+    --hash=sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b \
+    --hash=sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca \
+    --hash=sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e \
+    --hash=sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5 \
+    --hash=sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59 \
+    --hash=sha256:ba87babd629f8af77f557b61e49e7c7cac36f22f871156b91e10a6e9d4f829e9 \
+    --hash=sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8 \
+    --hash=sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db \
+    --hash=sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde \
+    --hash=sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7 \
+    --hash=sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb \
+    --hash=sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3 \
+    --hash=sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6 \
+    --hash=sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285 \
+    --hash=sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb \
+    --hash=sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8 \
+    --hash=sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482 \
+    --hash=sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd \
+    --hash=sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75 \
+    --hash=sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760 \
+    --hash=sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782 \
+    --hash=sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53 \
+    --hash=sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2 \
+    --hash=sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1 \
+    --hash=sha256:fe57328fbc1bfd0bd0514470ac692630f3901c0ee39052ae47acd1d90a436719 \
+    --hash=sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62
+    # via aiohttp
+zipp==3.21.0 \
+    --hash=sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4 \
+    --hash=sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931
+    # via etils
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==70.3.0 \
+    --hash=sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5 \
+    --hash=sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc
+    # via
+    #   -r requirements-dev.txt
+    #   google-api-core
+    #   tensorboard
+    #   tensorflow-cpu
diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt
new file mode 100644
index 000000000..bd93e0ff6
--- /dev/null
+++ b/requirements_lock_3_10.txt
@@ -0,0 +1,236 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    bazel run //:requirements_3_10.update
+#
+cachetools==5.5.0 \
+    --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \
+    --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a
+    # via -r requirements.txt
+cloudpickle==3.1.0 \
+    --hash=sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b \
+    --hash=sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e
+    # via -r requirements.txt
+dill==0.3.9 \
+    --hash=sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a \
+    --hash=sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c
+    # via multiprocess
+grpcio==1.66.0 \
+    --hash=sha256:0f3010bf46b2a01c9e40644cb9ed91b4b8435e5c500a275da5f9f62580e31e80 \
+    --hash=sha256:1c5466222470cb7fbc9cc898af1d48eefd297cb2e2f59af6d4a851c862fa90ac \
+    --hash=sha256:1eb03524d0f55b965d6c86aa44e5db9e5eaa15f9ed3b164621e652e5b927f4b8 \
+    --hash=sha256:230cdd696751e7eb1395718cd308234749daa217bb8d128f00357dc4df102558 \
+    --hash=sha256:245b08f9b3c645a6a623f3ed4fa43dcfcd6ad701eb9c32511c1bb7380e8c3d23 \
+    --hash=sha256:296a45ea835e12a1cc35ab0c57e455346c272af7b0d178e29c67742167262b4c \
+    --hash=sha256:37514b68a42e9cf24536345d3cf9e580ffd29117c158b4eeea34625200256067 \
+    --hash=sha256:375b58892301a5fc6ca7d7ff689c9dc9d00895f5d560604ace9f4f0573013c63 \
+    --hash=sha256:423ae18637cd99ddcf2e5a6851c61828c49e9b9d022d0442d979b4f230109787 \
+    --hash=sha256:49234580a073ce7ac490112f6c67c874cbcb27804c4525978cdb21ba7f3f193c \
+    --hash=sha256:508411df1f2b7cfa05d4d7dbf3d576fe4f949cd61c03f3a6f0378c84e3d7b963 \
+    --hash=sha256:50cea8ce2552865b87e3dffbb85eb21e6b98d928621600c0feda2f02449cd837 \
+    --hash=sha256:516fdbc8e156db71a004bc431a6303bca24cfde186babe96dde7bd01e8f0cc70 \
+    --hash=sha256:526d4f6ca19f31b25606d5c470ecba55c0b22707b524e4de8987919e8920437d \
+    --hash=sha256:53d4c6706b49e358a2a33345dbe9b6b3bb047cecd7e8c07ba383bd09349bfef8 \
+    --hash=sha256:5b15ef1b296c4e78f15f64fc65bf8081f8774480ffcac45642f69d9d753d9c6b \
+    --hash=sha256:5e8140b39f10d7be2263afa2838112de29374c5c740eb0afd99146cb5bdbd990 \
+    --hash=sha256:5ea27f4ce8c0daccfdd2c7961e6ba404b6599f47c948415c4cca5728739107a3 \
+    --hash=sha256:5f4b3357e59dfba9140a51597287297bc638710d6a163f99ee14efc19967a821 \
+    --hash=sha256:5f93fc84b72bbc7b84a42f3ca9dc055fa00d2303d9803be011ebf7a10a4eb833 \
+    --hash=sha256:643d8d9632a688ae69661e924b862e23c83a3575b24e52917ec5bcc59543d212 \
+    --hash=sha256:684a4c07883cbd4ac864f0d08d927267404f5f0c76f31c85f9bbe05f2daae2f2 \
+    --hash=sha256:6d586a95c05c82a5354be48bb4537e1accaf2472d8eb7e9086d844cbff934482 \
+    --hash=sha256:6ed35bf7da3fb3b1949e32bdf47a8b5ffe0aed11722d948933bd068531cd4682 \
+    --hash=sha256:748452dbd5a047475d5413bdef08b0b9ceb2c0c0e249d4ee905a5fb82c6328dc \
+    --hash=sha256:7bc9d823e05d63a87511fb456dcc48dc0fced86c282bf60229675e7ee7aac1a1 \
+    --hash=sha256:8096a922eb91bc97c839f675c3efa1257c6ef181ae1b25d3fb97f2cae4c57c01 \
+    --hash=sha256:832945e64176520520317b50d64ec7d79924429528d5747669b52d0bf2c7bd78 \
+    --hash=sha256:8fc5c710ddd51b5a0dc36ef1b6663430aa620e0ce029b87b150dafd313b978c3 \
+    --hash=sha256:921b8f7f25d5300d7c6837a1e0639ef145fbdbfb728e0a5db2dbccc9fc0fd891 \
+    --hash=sha256:9d5251578767fe44602688c851c2373b5513048ac84c21a0fe946590a8e7933d \
+    --hash=sha256:a639d3866bfb5a678b5c0b92cd7ab543033ed8988854290fd86145e71731fd4c \
+    --hash=sha256:aaf30c75cbaf30e561ca45f21eb1f729f0fab3f15c592c1074795ed43e3ff96f \
+    --hash=sha256:ad7256f224437b2c29c2bef98ddd3130454c5b1ab1f0471fc11794cefd4dbd3d \
+    --hash=sha256:ba18cfdc09312eb2eea6fa0ce5d2eec3cf345ea78f6528b2eaed6432105e0bd0 \
+    --hash=sha256:ba60ae3b465b3e85080ae3bfbc36fd0305ae495ab16fcf8022fc7d7a23aac846 \
+    --hash=sha256:bc008c6afa1e7c8df99bd9154abc4f0470d26b7730ca2521122e99e771baa8c7 \
+    --hash=sha256:c072f90a1f0409f827ae86266984cba65e89c5831a0726b9fc7f4b5fb940b853 \
+    --hash=sha256:c1ea4c528e7db6660718e4165fd1b5ac24b79a70c870a7bc0b7bdb9babab7c1e \
+    --hash=sha256:c3084e590e857ba7585ae91078e4c9b6ef55aaf1dc343ce26400ba59a146eada \
+    --hash=sha256:c3f6feb0dc8456d025e566709f7dd02885add99bedaac50229013069242a1bfd \
+    --hash=sha256:d0439a970d65327de21c299ea0e0c2ad0987cdaf18ba5066621dea5f427f922b \
+    --hash=sha256:dd614370e939f9fceeeb2915111a0795271b4c11dfb5fc0f58449bee40c726a5 \
+    --hash=sha256:de9e20a0acb709dcfa15a622c91f584f12c9739a79c47999f73435d2b3cc8a3b \
+    --hash=sha256:e36fa838ac1d6c87198ca149cbfcc92e1af06bb8c8cd852622f8e58f33ea3324 \
+    --hash=sha256:e8d20308eeae15b3e182f47876f05acbdec1eebd9473a9814a44e46ec4a84c04
+    # via -r requirements.txt
+jax[cpu]==0.4.34 \
+    --hash=sha256:44196854f40c5f9cea3142824b9f1051f85afc3fcf7593ec5479fc8db01c58db \
+    --hash=sha256:b957ca1fc91f7343f91a186af9f19c7f342c946f95a8c11c7f1e5cdfe2e58d9e
+    # via -r requirements.txt
+jaxlib==0.4.34 \
+    --hash=sha256:096f0ca309d41fa692a9d1f2f9baab1c5c8ca0749876ebb3f748e738a27c7ff4 \
+    --hash=sha256:133070d4fec5525ffea4dc72956398c1cf647a04dcb37f8a935ee82af78d9965 \
+    --hash=sha256:1a30771d85fa77f9ab8f18e63240f455ab3a3f87660ed7b8d5eea6ceecbe5c1e \
+    --hash=sha256:3bcfa639ca3cfaf86c8ceebd5fc0d47300fd98a078014a1d0cc03133e1523d5f \
+    --hash=sha256:3e60bc826933082e99b19b87c21818a8d26fcdb01f418d47cedff554746fd6cc \
+    --hash=sha256:45d719a2ce0ebf21255a277b71d756f3609b7b5be70cddc5d88fd58c35219de0 \
+    --hash=sha256:48272e9034ff868d4328cf0055a07882fd2be93f59dfb6283af7de491f9d1290 \
+    --hash=sha256:571ef03259835458111596a71a2f4a6fabf4ec34595df4cea555035362ac5bf0 \
+    --hash=sha256:6b43a974c5d91a19912d138f2658dd8dbb7d30dcdff5c961d896c673e872b611 \
+    --hash=sha256:72e22e99a5dc890a64443c3fc12f13f20091f578c405a76de077ba42b4c62cd7 \
+    --hash=sha256:7be673a876ebd1aef440fb7e3ebaf99a91abeb550c9728c644b7d7c7b5d7c108 \
+    --hash=sha256:87f25a477cd279840e53718403f97092eba0e8a945fcab47bcf435b6f9119dda \
+    --hash=sha256:8ee3f93836e53c86556ccd9449a4ea43516ee05184d031a71dd692e81259f7d9 \
+    --hash=sha256:901cb4040ed24eae40071d8114ea8d10dff436277fa74a1a5b9e7206f641151c \
+    --hash=sha256:b0001c8f0e2b1c7bc99e4f314b524a340d25653505c1a1484d4041a9d3617f6f \
+    --hash=sha256:b7a212a3cb5c6acc201c32ae4f4b5f5a9ac09457fbb77ba8db5ce7e7d4adc214 \
+    --hash=sha256:c303f5acaf6c56ce5ff133a923c9b6247bdebedde15bd2c893c24be4d8f71306 \
+    --hash=sha256:c7b3e724a30426a856070aba0192b5d199e95b4411070e7ad96ad8b196877b10 \
+    --hash=sha256:c9d3adcae43a33aad4332be9c2aedc5ef751d1e755f917a5afb30c7872eacaa8 \
+    --hash=sha256:d840e64b85f8865404d6d225b9bb340e158df1457152a361b05680e24792b232
+    # via jax
+ml-dtypes==0.5.0 \
+    --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \
+    --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \
+    --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \
+    --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \
+    --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \
+    --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \
+    --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \
+    --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \
+    --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \
+    --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \
+    --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \
+    --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \
+    --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \
+    --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \
+    --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \
+    --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \
+    --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \
+    --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \
+    --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \
+    --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \
+    --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599
+    # via
+    #   jax
+    #   jaxlib
+multiprocess==0.70.17 \
+    --hash=sha256:1d52f068357acd1e5bbc670b273ef8f81d57863235d9fbf9314751886e141968 \
+    --hash=sha256:20c28ca19079a6c879258103a6d60b94d4ffe2d9da07dda93fb1c8bc6243f522 \
+    --hash=sha256:27b8409c02b5dd89d336107c101dfbd1530a2cd4fd425fc27dcb7adb6e0b47bf \
+    --hash=sha256:2818af14c52446b9617d1b0755fa70ca2f77c28b25ed97bdaa2c69a22c47b46c \
+    --hash=sha256:2884701445d0177aec5bd5f6ee0df296773e4fb65b11903b94c613fb46cfb7d1 \
+    --hash=sha256:2b12e081df87ab755190e227341b2c3b17ee6587e9c82fecddcbe6aa812cd7f7 \
+    --hash=sha256:2ea0939b0f4760a16a548942c65c76ff5afd81fbf1083c56ae75e21faf92e426 \
+    --hash=sha256:349525099a0c9ac5936f0488b5ee73199098dac3ac899d81d326d238f9fd3ccd \
+    --hash=sha256:38357ca266b51a2e22841b755d9a91e4bb7b937979a54d411677111716c32744 \
+    --hash=sha256:4ae2f11a3416809ebc9a48abfc8b14ecce0652a0944731a1493a3c1ba44ff57a \
+    --hash=sha256:7ddb24e5bcdb64e90ec5543a1f05a39463068b6d3b804aa3f2a4e16ec28562d6 \
+    --hash=sha256:a0f01cd9d079af7a8296f521dc03859d1a414d14c1e2b6e676ef789333421c95 \
+    --hash=sha256:a22a6b1a482b80eab53078418bb0f7025e4f7d93cc8e1f36481477a023884861 \
+    --hash=sha256:c2c82d0375baed8d8dd0d8c38eb87c5ae9c471f8e384ad203a36f095ee860f67 \
+    --hash=sha256:c3feb874ba574fbccfb335980020c1ac631fbf2a3f7bee4e2042ede62558a021 \
+    --hash=sha256:d729f55198a3579f6879766a6d9b72b42d4b320c0dcb7844afb774d75b573c62
+    # via -r requirements.txt
+numpy==1.26.4 \
+    --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \
+    --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \
+    --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \
+    --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \
+    --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \
+    --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \
+    --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \
+    --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \
+    --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \
+    --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \
+    --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \
+    --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \
+    --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \
+    --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \
+    --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \
+    --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \
+    --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \
+    --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \
+    --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \
+    --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \
+    --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \
+    --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \
+    --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \
+    --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \
+    --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \
+    --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \
+    --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \
+    --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \
+    --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \
+    --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \
+    --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \
+    --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \
+    --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \
+    --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \
+    --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \
+    --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f
+    # via
+    #   -r requirements.txt
+    #   jax
+    #   jaxlib
+    #   ml-dtypes
+    #   scipy
+opt-einsum==3.4.0 \
+    --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \
+    --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac
+    # via jax
+protobuf==5.27.3 \
+    --hash=sha256:043853dcb55cc262bf2e116215ad43fa0859caab79bb0b2d31b708f128ece035 \
+    --hash=sha256:16ddf3f8c6c41e1e803da7abea17b1793a97ef079a912e42351eabb19b2cffe7 \
+    --hash=sha256:68248c60d53f6168f565a8c76dc58ba4fa2ade31c2d1ebdae6d80f969cdc2d4f \
+    --hash=sha256:82460903e640f2b7e34ee81a947fdaad89de796d324bcbc38ff5430bcdead82c \
+    --hash=sha256:8572c6533e544ebf6899c360e91d6bcbbee2549251643d32c52cf8a5de295ba5 \
+    --hash=sha256:a55c48f2a2092d8e213bd143474df33a6ae751b781dd1d1f4d953c128a415b25 \
+    --hash=sha256:af7c0b7cfbbb649ad26132e53faa348580f844d9ca46fd3ec7ca48a1ea5db8a1 \
+    --hash=sha256:b8a994fb3d1c11156e7d1e427186662b64694a62b55936b2b9348f0a7c6625ce \
+    --hash=sha256:c2a105c24f08b1e53d6c7ffe69cb09d0031512f0b72f812dd4005b8112dbe91e \
+    --hash=sha256:c84eee2c71ed83704f1afbf1a85c3171eab0fd1ade3b399b3fad0884cbcca8bf \
+    --hash=sha256:dcb307cd4ef8fec0cf52cb9105a03d06fbb5275ce6d84a6ae33bc6cf84e0a07b
+    # via -r requirements.txt
+scipy==1.14.1 \
+    --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \
+    --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \
+    --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \
+    --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \
+    --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \
+    --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \
+    --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \
+    --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \
+    --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \
+    --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \
+    --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \
+    --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \
+    --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \
+    --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \
+    --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \
+    --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \
+    --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \
+    --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \
+    --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \
+    --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \
+    --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \
+    --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \
+    --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \
+    --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \
+    --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \
+    --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \
+    --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \
+    --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \
+    --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \
+    --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \
+    --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \
+    --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \
+    --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2
+    # via
+    #   jax
+    #   jaxlib
+termcolor==2.5.0 \
+    --hash=sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8 \
+    --hash=sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f
+    # via -r requirements.txt
diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt
new file mode 100644
index 000000000..26200fd67
--- /dev/null
+++ b/requirements_lock_3_11.txt
@@ -0,0 +1,236 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    bazel run //:requirements_3_11.update
+#
+cachetools==5.5.0 \
+    --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \
+    --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a
+    # via -r requirements.txt
+cloudpickle==3.1.0 \
+    --hash=sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b \
+    --hash=sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e
+    # via -r requirements.txt
+dill==0.3.9 \
+    --hash=sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a \
+    --hash=sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c
+    # via multiprocess
+grpcio==1.66.0 \
+    --hash=sha256:0f3010bf46b2a01c9e40644cb9ed91b4b8435e5c500a275da5f9f62580e31e80 \
+    --hash=sha256:1c5466222470cb7fbc9cc898af1d48eefd297cb2e2f59af6d4a851c862fa90ac \
+    --hash=sha256:1eb03524d0f55b965d6c86aa44e5db9e5eaa15f9ed3b164621e652e5b927f4b8 \
+    --hash=sha256:230cdd696751e7eb1395718cd308234749daa217bb8d128f00357dc4df102558 \
+    --hash=sha256:245b08f9b3c645a6a623f3ed4fa43dcfcd6ad701eb9c32511c1bb7380e8c3d23 \
+    --hash=sha256:296a45ea835e12a1cc35ab0c57e455346c272af7b0d178e29c67742167262b4c \
+    --hash=sha256:37514b68a42e9cf24536345d3cf9e580ffd29117c158b4eeea34625200256067 \
+    --hash=sha256:375b58892301a5fc6ca7d7ff689c9dc9d00895f5d560604ace9f4f0573013c63 \
+    --hash=sha256:423ae18637cd99ddcf2e5a6851c61828c49e9b9d022d0442d979b4f230109787 \
+    --hash=sha256:49234580a073ce7ac490112f6c67c874cbcb27804c4525978cdb21ba7f3f193c \
+    --hash=sha256:508411df1f2b7cfa05d4d7dbf3d576fe4f949cd61c03f3a6f0378c84e3d7b963 \
+    --hash=sha256:50cea8ce2552865b87e3dffbb85eb21e6b98d928621600c0feda2f02449cd837 \
+    --hash=sha256:516fdbc8e156db71a004bc431a6303bca24cfde186babe96dde7bd01e8f0cc70 \
+    --hash=sha256:526d4f6ca19f31b25606d5c470ecba55c0b22707b524e4de8987919e8920437d \
+    --hash=sha256:53d4c6706b49e358a2a33345dbe9b6b3bb047cecd7e8c07ba383bd09349bfef8 \
+    --hash=sha256:5b15ef1b296c4e78f15f64fc65bf8081f8774480ffcac45642f69d9d753d9c6b \
+    --hash=sha256:5e8140b39f10d7be2263afa2838112de29374c5c740eb0afd99146cb5bdbd990 \
+    --hash=sha256:5ea27f4ce8c0daccfdd2c7961e6ba404b6599f47c948415c4cca5728739107a3 \
+    --hash=sha256:5f4b3357e59dfba9140a51597287297bc638710d6a163f99ee14efc19967a821 \
+    --hash=sha256:5f93fc84b72bbc7b84a42f3ca9dc055fa00d2303d9803be011ebf7a10a4eb833 \
+    --hash=sha256:643d8d9632a688ae69661e924b862e23c83a3575b24e52917ec5bcc59543d212 \
+    --hash=sha256:684a4c07883cbd4ac864f0d08d927267404f5f0c76f31c85f9bbe05f2daae2f2 \
+    --hash=sha256:6d586a95c05c82a5354be48bb4537e1accaf2472d8eb7e9086d844cbff934482 \
+    --hash=sha256:6ed35bf7da3fb3b1949e32bdf47a8b5ffe0aed11722d948933bd068531cd4682 \
+    --hash=sha256:748452dbd5a047475d5413bdef08b0b9ceb2c0c0e249d4ee905a5fb82c6328dc \
+    --hash=sha256:7bc9d823e05d63a87511fb456dcc48dc0fced86c282bf60229675e7ee7aac1a1 \
+    --hash=sha256:8096a922eb91bc97c839f675c3efa1257c6ef181ae1b25d3fb97f2cae4c57c01 \
+    --hash=sha256:832945e64176520520317b50d64ec7d79924429528d5747669b52d0bf2c7bd78 \
+    --hash=sha256:8fc5c710ddd51b5a0dc36ef1b6663430aa620e0ce029b87b150dafd313b978c3 \
+    --hash=sha256:921b8f7f25d5300d7c6837a1e0639ef145fbdbfb728e0a5db2dbccc9fc0fd891 \
+    --hash=sha256:9d5251578767fe44602688c851c2373b5513048ac84c21a0fe946590a8e7933d \
+    --hash=sha256:a639d3866bfb5a678b5c0b92cd7ab543033ed8988854290fd86145e71731fd4c \
+    --hash=sha256:aaf30c75cbaf30e561ca45f21eb1f729f0fab3f15c592c1074795ed43e3ff96f \
+    --hash=sha256:ad7256f224437b2c29c2bef98ddd3130454c5b1ab1f0471fc11794cefd4dbd3d \
+    --hash=sha256:ba18cfdc09312eb2eea6fa0ce5d2eec3cf345ea78f6528b2eaed6432105e0bd0 \
+    --hash=sha256:ba60ae3b465b3e85080ae3bfbc36fd0305ae495ab16fcf8022fc7d7a23aac846 \
+    --hash=sha256:bc008c6afa1e7c8df99bd9154abc4f0470d26b7730ca2521122e99e771baa8c7 \
+    --hash=sha256:c072f90a1f0409f827ae86266984cba65e89c5831a0726b9fc7f4b5fb940b853 \
+    --hash=sha256:c1ea4c528e7db6660718e4165fd1b5ac24b79a70c870a7bc0b7bdb9babab7c1e \
+    --hash=sha256:c3084e590e857ba7585ae91078e4c9b6ef55aaf1dc343ce26400ba59a146eada \
+    --hash=sha256:c3f6feb0dc8456d025e566709f7dd02885add99bedaac50229013069242a1bfd \
+    --hash=sha256:d0439a970d65327de21c299ea0e0c2ad0987cdaf18ba5066621dea5f427f922b \
+    --hash=sha256:dd614370e939f9fceeeb2915111a0795271b4c11dfb5fc0f58449bee40c726a5 \
+    --hash=sha256:de9e20a0acb709dcfa15a622c91f584f12c9739a79c47999f73435d2b3cc8a3b \
+    --hash=sha256:e36fa838ac1d6c87198ca149cbfcc92e1af06bb8c8cd852622f8e58f33ea3324 \
+    --hash=sha256:e8d20308eeae15b3e182f47876f05acbdec1eebd9473a9814a44e46ec4a84c04
+    # via -r requirements.txt
+jax[cpu]==0.4.34 \
+    --hash=sha256:44196854f40c5f9cea3142824b9f1051f85afc3fcf7593ec5479fc8db01c58db \
+    --hash=sha256:b957ca1fc91f7343f91a186af9f19c7f342c946f95a8c11c7f1e5cdfe2e58d9e
+    # via -r requirements.txt
+jaxlib==0.4.34 \
+    --hash=sha256:096f0ca309d41fa692a9d1f2f9baab1c5c8ca0749876ebb3f748e738a27c7ff4 \
+    --hash=sha256:133070d4fec5525ffea4dc72956398c1cf647a04dcb37f8a935ee82af78d9965 \
+    --hash=sha256:1a30771d85fa77f9ab8f18e63240f455ab3a3f87660ed7b8d5eea6ceecbe5c1e \
+    --hash=sha256:3bcfa639ca3cfaf86c8ceebd5fc0d47300fd98a078014a1d0cc03133e1523d5f \
+    --hash=sha256:3e60bc826933082e99b19b87c21818a8d26fcdb01f418d47cedff554746fd6cc \
+    --hash=sha256:45d719a2ce0ebf21255a277b71d756f3609b7b5be70cddc5d88fd58c35219de0 \
+    --hash=sha256:48272e9034ff868d4328cf0055a07882fd2be93f59dfb6283af7de491f9d1290 \
+    --hash=sha256:571ef03259835458111596a71a2f4a6fabf4ec34595df4cea555035362ac5bf0 \
+    --hash=sha256:6b43a974c5d91a19912d138f2658dd8dbb7d30dcdff5c961d896c673e872b611 \
+    --hash=sha256:72e22e99a5dc890a64443c3fc12f13f20091f578c405a76de077ba42b4c62cd7 \
+    --hash=sha256:7be673a876ebd1aef440fb7e3ebaf99a91abeb550c9728c644b7d7c7b5d7c108 \
+    --hash=sha256:87f25a477cd279840e53718403f97092eba0e8a945fcab47bcf435b6f9119dda \
+    --hash=sha256:8ee3f93836e53c86556ccd9449a4ea43516ee05184d031a71dd692e81259f7d9 \
+    --hash=sha256:901cb4040ed24eae40071d8114ea8d10dff436277fa74a1a5b9e7206f641151c \
+    --hash=sha256:b0001c8f0e2b1c7bc99e4f314b524a340d25653505c1a1484d4041a9d3617f6f \
+    --hash=sha256:b7a212a3cb5c6acc201c32ae4f4b5f5a9ac09457fbb77ba8db5ce7e7d4adc214 \
+    --hash=sha256:c303f5acaf6c56ce5ff133a923c9b6247bdebedde15bd2c893c24be4d8f71306 \
+    --hash=sha256:c7b3e724a30426a856070aba0192b5d199e95b4411070e7ad96ad8b196877b10 \
+    --hash=sha256:c9d3adcae43a33aad4332be9c2aedc5ef751d1e755f917a5afb30c7872eacaa8 \
+    --hash=sha256:d840e64b85f8865404d6d225b9bb340e158df1457152a361b05680e24792b232
+    # via jax
+ml-dtypes==0.5.0 \
+    --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \
+    --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \
+    --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \
+    --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \
+    --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \
+    --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \
+    --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \
+    --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \
+    --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \
+    --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \
+    --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \
+    --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \
+    --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \
+    --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \
+    --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \
+    --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \
+    --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \
+    --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \
+    --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \
+    --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \
+    --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599
+    # via
+    #   jax
+    #   jaxlib
+multiprocess==0.70.17 \
+    --hash=sha256:1d52f068357acd1e5bbc670b273ef8f81d57863235d9fbf9314751886e141968 \
+    --hash=sha256:20c28ca19079a6c879258103a6d60b94d4ffe2d9da07dda93fb1c8bc6243f522 \
+    --hash=sha256:27b8409c02b5dd89d336107c101dfbd1530a2cd4fd425fc27dcb7adb6e0b47bf \
+    --hash=sha256:2818af14c52446b9617d1b0755fa70ca2f77c28b25ed97bdaa2c69a22c47b46c \
+    --hash=sha256:2884701445d0177aec5bd5f6ee0df296773e4fb65b11903b94c613fb46cfb7d1 \
+    --hash=sha256:2b12e081df87ab755190e227341b2c3b17ee6587e9c82fecddcbe6aa812cd7f7 \
+    --hash=sha256:2ea0939b0f4760a16a548942c65c76ff5afd81fbf1083c56ae75e21faf92e426 \
+    --hash=sha256:349525099a0c9ac5936f0488b5ee73199098dac3ac899d81d326d238f9fd3ccd \
+    --hash=sha256:38357ca266b51a2e22841b755d9a91e4bb7b937979a54d411677111716c32744 \
+    --hash=sha256:4ae2f11a3416809ebc9a48abfc8b14ecce0652a0944731a1493a3c1ba44ff57a \
+    --hash=sha256:7ddb24e5bcdb64e90ec5543a1f05a39463068b6d3b804aa3f2a4e16ec28562d6 \
+    --hash=sha256:a0f01cd9d079af7a8296f521dc03859d1a414d14c1e2b6e676ef789333421c95 \
+    --hash=sha256:a22a6b1a482b80eab53078418bb0f7025e4f7d93cc8e1f36481477a023884861 \
+    --hash=sha256:c2c82d0375baed8d8dd0d8c38eb87c5ae9c471f8e384ad203a36f095ee860f67 \
+    --hash=sha256:c3feb874ba574fbccfb335980020c1ac631fbf2a3f7bee4e2042ede62558a021 \
+    --hash=sha256:d729f55198a3579f6879766a6d9b72b42d4b320c0dcb7844afb774d75b573c62
+    # via -r requirements.txt
+numpy==1.26.4 \
+    --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \
+    --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \
+    --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \
+    --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \
+    --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \
+    --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \
+    --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \
+    --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \
+    --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \
+    --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \
+    --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \
+    --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \
+    --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \
+    --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \
+    --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \
+    --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \
+    --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \
+    --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \
+    --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \
+    --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \
+    --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \
+    --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \
+    --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \
+    --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \
+    --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \
+    --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \
+    --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \
+    --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \
+    --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \
+    --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \
+    --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \
+    --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \
+    --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \
+    --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \
+    --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \
+    --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f
+    # via
+    #   -r requirements.txt
+    #   jax
+    #   jaxlib
+    #   ml-dtypes
+    #   scipy
+opt-einsum==3.4.0 \
+    --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \
+    --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac
+    # via jax
+protobuf==5.27.3 \
+    --hash=sha256:043853dcb55cc262bf2e116215ad43fa0859caab79bb0b2d31b708f128ece035 \
+    --hash=sha256:16ddf3f8c6c41e1e803da7abea17b1793a97ef079a912e42351eabb19b2cffe7 \
+    --hash=sha256:68248c60d53f6168f565a8c76dc58ba4fa2ade31c2d1ebdae6d80f969cdc2d4f \
+    --hash=sha256:82460903e640f2b7e34ee81a947fdaad89de796d324bcbc38ff5430bcdead82c \
+    --hash=sha256:8572c6533e544ebf6899c360e91d6bcbbee2549251643d32c52cf8a5de295ba5 \
+    --hash=sha256:a55c48f2a2092d8e213bd143474df33a6ae751b781dd1d1f4d953c128a415b25 \
+    --hash=sha256:af7c0b7cfbbb649ad26132e53faa348580f844d9ca46fd3ec7ca48a1ea5db8a1 \
+    --hash=sha256:b8a994fb3d1c11156e7d1e427186662b64694a62b55936b2b9348f0a7c6625ce \
+    --hash=sha256:c2a105c24f08b1e53d6c7ffe69cb09d0031512f0b72f812dd4005b8112dbe91e \
+    --hash=sha256:c84eee2c71ed83704f1afbf1a85c3171eab0fd1ade3b399b3fad0884cbcca8bf \
+    --hash=sha256:dcb307cd4ef8fec0cf52cb9105a03d06fbb5275ce6d84a6ae33bc6cf84e0a07b
+    # via -r requirements.txt
+scipy==1.14.1 \
+    --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \
+    --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \
+    --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \
+    --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \
+    --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \
+    --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \
+    --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \
+    --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \
+    --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \
+    --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \
+    --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \
+    --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \
+    --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \
+    --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \
+    --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \
+    --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \
+    --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \
+    --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \
+    --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \
+    --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \
+    --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \
+    --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \
+    --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \
+    --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \
+    --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \
+    --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \
+    --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \
+    --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \
+    --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \
+    --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \
+    --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \
+    --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \
+    --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2
+    # via
+    #   jax
+    #   jaxlib
+termcolor==2.5.0 \
+    --hash=sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8 \
+    --hash=sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f
+    # via -r requirements.txt
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 04e47948b..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,9 +0,0 @@
-[pep8]
-max-line-length = 80
-
-[pycodestyle]
-max-line-length = 80
-
-[yapf]
-based_on_style = pep8
-column_limit = 80
diff --git a/setup.py b/setup.py
deleted file mode 100644
index cb1cac998..000000000
--- a/setup.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# Copyright 2023 Ant Group Co., Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Ideas borrowed from: https://github.com/ray-project/ray/blob/master/python/setup.py
-
-import io
-import logging
-import os
-import platform
-import re
-import shutil
-import subprocess
-import sys
-from datetime import datetime, timedelta
-
-import setuptools
-import setuptools.command.build_ext
-
-logger = logging.getLogger(__name__)
-
-# 3.8 is the minimum python version we can support
-SUPPORTED_PYTHONS = [(3, 9), (3, 10), (3, 11)]
-
-BAZEL_MAX_JOBS = os.getenv("BAZEL_MAX_JOBS")
-ROOT_DIR = os.path.dirname(__file__)
-SKIP_BAZEL_CLEAN = os.getenv("SKIP_BAZEL_CLEAN")
-ENABLE_GPU_BUILD = os.getenv("ENABLE_GPU_BUILD")
-
-pyd_suffix = ".so"
-
-
-def add_date_to_version(*filepath):
-    local_time = datetime.utcnow()
-    chn_time = local_time + timedelta(hours=8)
-    dstr = chn_time.strftime("%Y%m%d")
-    with open(os.path.join(ROOT_DIR, *filepath), "r") as fp:
-        content = fp.read()
-
-    content = content.replace("$$DATE$$", dstr)
-
-    with open(os.path.join(ROOT_DIR, *filepath), "w+") as fp:
-        fp.write(content)
-
-
-def find_version(*filepath):
-    add_date_to_version(*filepath)
-    # Extract version information from filepath
-    with open(os.path.join(ROOT_DIR, *filepath)) as fp:
-        version_match = re.search(
-            r"^#define SPU_VERSION ['\"]([^'\"]*)['\"]", fp.read(), re.M
-        )
-        if version_match:
-            return version_match.group(1)
-        raise RuntimeError("Unable to find version string.")
-
-
-def read_requirements(*filepath):
-    requirements = []
-    with open(os.path.join(ROOT_DIR, *filepath)) as file:
-        requirements = file.read().splitlines()
-    return requirements
-
-
-class SetupSpec:
-    def __init__(self, name: str, description: str):
-        self.name: str = name
-        self.version = find_version("libspu", "version.h")
-        self.description: str = description
-        self.files_to_include: list = []
-        self.install_requires: list = []
-        self.extras: dict = {}
-
-    def get_packages(self):
-        return setuptools.find_packages()
-
-
-setup_spec = SetupSpec(
-    "spu",
-    "SPU aims to be a 'provable', 'measurable' secure computation device.",
-)
-
-# Ideally, we could include these files by putting them in a
-# MANIFEST.in or using the package_data argument to setup, but the
-# MANIFEST.in gets applied at the very beginning when setup.py runs
-# before these files have been created, so we have to move the files
-# manually.
-
-# NOTE: The lists below must be kept in sync with spu/BUILD.bazel.
-spu_lib_files = [
-    "bazel-bin/spu/libspu" + pyd_suffix,
-    "bazel-bin/spu/libpsi" + pyd_suffix,
-]
-
-# These are the directories where automatically generated Python protobuf
-# bindings are created.
-generated_python_directories = [
-    "bazel-bin/spu",
-    "bazel-bin/libspu",
-    "bazel-bin/spu/utils",
-]
-
-setup_spec.install_requires = read_requirements('requirements.txt')
-
-files_to_remove = [
-    "spu/intrinsic/add_new_intrinsic.py",
-]
-
-
-# Calls Bazel in PATH
-def bazel_invoke(invoker, cmdline, *args, **kwargs):
-    try:
-        result = invoker(['bazel'] + cmdline, *args, **kwargs)
-        return result
-    except IOError:
-        raise
-
-
-def build(build_python, build_cpp):
-    if tuple(sys.version_info[:2]) not in SUPPORTED_PYTHONS:
-        msg = (
-            "Detected Python version {}, which is not supported. "
-            "Only Python {} are supported."
-        ).format(
-            ".".join(map(str, sys.version_info[:2])),
-            ", ".join(".".join(map(str, v)) for v in SUPPORTED_PYTHONS),
-        )
-        raise RuntimeError(msg)
-
-    bazel_env = dict(os.environ, PYTHON3_BIN_PATH=sys.executable)
-
-    bazel_flags = ["--verbose_failures"]
-    if BAZEL_MAX_JOBS:
-        n = int(BAZEL_MAX_JOBS)  # the value must be an int
-        bazel_flags.append("--jobs")
-        bazel_flags.append(f"{n}")
-
-    bazel_precmd_flags = []
-
-    bazel_targets = []
-    bazel_targets += ["//spu:init", "//spu/utils:distributed"] if build_python else []
-    bazel_targets += ["//spu:api"] if build_cpp else []
-
-    bazel_flags.extend(["-c", "opt"])
-
-    if sys.platform == "linux" and ENABLE_GPU_BUILD:
-        bazel_flags.extend(["--config=gpu"])
-
-    if platform.machine() == "x86_64":
-        bazel_flags.extend(["--config=avx"])
-
-    print(f"Build with extra flags = {bazel_flags}")
-
-    return bazel_invoke(
-        subprocess.check_call,
-        bazel_precmd_flags + ["build"] + bazel_flags + ["--"] + bazel_targets,
-        env=bazel_env,
-    )
-
-
-def remove_prefix(text, prefix):
-    return text[text.startswith(prefix) and len(prefix) :]
-
-
-def copy_file(target_dir, filename, rootdir):
-    source = os.path.relpath(filename, rootdir)
-    destination = os.path.join(target_dir, remove_prefix(source, 'bazel-bin/'))
-
-    # Create the target directory if it doesn't already exist.
-    os.makedirs(os.path.dirname(destination), exist_ok=True)
-    if not os.path.exists(destination):
-        print(f"Copy file from {source} to {destination}")
-        shutil.copy(source, destination, follow_symlinks=True)
-        return 1
-    return 0
-
-
-def remove_file(target_dir, filename):
-    file = os.path.join(target_dir, filename)
-    if os.path.exists(file):
-        print(f"delete {file}")
-        os.remove(file)
-        return 1
-    return 0
-
-
-def fix_pb(file, old_pattern, new_pattern):
-    os.chmod(file, 0o666)
-    with open(file, 'r+') as f:
-        content = f.read()
-        content = content.replace(old_pattern, new_pattern)
-
-    with open(file, 'w+') as f:
-        f.write(content)
-
-
-def pip_run(build_ext):
-    build(True, True)
-
-    # Change __module__ in psi_pb2.py and pir_pb2.py
-    fix_pb('bazel-bin/spu/psi_pb2.py', 'psi.psi.psi_pb2', 'spu.psi_pb2')
-    fix_pb('bazel-bin/spu/link_pb2.py', 'yacl.link.link_pb2', 'link.pir_pb2')
-    fix_pb('bazel-bin/spu/psi_v2_pb2.py', 'psi.proto.psi_v2_pb2', 'spu.psi_pb2')
-    fix_pb('bazel-bin/spu/pir_pb2.py', 'psi.pir.pir_pb2', 'spu.pir_pb2')
-
-    setup_spec.files_to_include += spu_lib_files
-
-    # Copy over the autogenerated protobuf Python bindings.
-    for directory in generated_python_directories:
-        for filename in os.listdir(directory):
-            if filename[-3:] == ".py":
-                setup_spec.files_to_include.append(os.path.join(directory, filename))
-
-    copied_files = 0
-    for filename in setup_spec.files_to_include:
-        copied_files += copy_file(build_ext.build_lib, filename, ROOT_DIR)
-    print("# of files copied to {}: {}".format(build_ext.build_lib, copied_files))
-
-    deleted_files = 0
-    for filename in files_to_remove:
-        deleted_files += remove_file(build_ext.build_lib, filename)
-    print("# of files deleted in {}: {}".format(build_ext.build_lib, deleted_files))
-
-
-class build_ext(setuptools.command.build_ext.build_ext):
-    def run(self):
-        return pip_run(self)
-
-
-class BinaryDistribution(setuptools.Distribution):
-    def has_ext_modules(self):
-        return True
-
-
-# Ensure no remaining lib files.
-build_dir = os.path.join(ROOT_DIR, "build")
-if os.path.isdir(build_dir):
-    shutil.rmtree(build_dir)
-
-if not SKIP_BAZEL_CLEAN:
-    bazel_invoke(subprocess.check_call, ['clean'])
-
-# Default Linux platform tag
-plat_name = "manylinux2014_x86_64"
-
-if sys.platform == "darwin":
-    # Due to a bug in conda x64 python, platform tag has to be 10_16 for X64 wheel
-    if platform.machine() == "x86_64":
-        plat_name = "macosx_13_0_x86_64"
-    else:
-        plat_name = "macosx_13_0_arm64"
-elif platform.machine() == "aarch64":
-    # Linux aarch64
-    plat_name = "manylinux_2_28_aarch64"
-
-setuptools.setup(
-    name=setup_spec.name,
-    version=setup_spec.version,
-    author="SecretFlow Team",
-    author_email='secretflow-contact@service.alipay.com',
-    description=(setup_spec.description),
-    long_description=io.open(
-        os.path.join(ROOT_DIR, "README.md"), "r", encoding="utf-8"
-    ).read(),
-    long_description_content_type='text/markdown',
-    url="https://github.com/secretflow/spu",
-    keywords=("spu mpc secretflow compiler vm ABY3 secure computation"),
-    classifiers=[
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-    ],
-    packages=setup_spec.get_packages(),
-    cmdclass={"build_ext": build_ext},
-    # The BinaryDistribution argument triggers build_ext.
-    distclass=BinaryDistribution,
-    install_requires=setup_spec.install_requires,
-    setup_requires=["wheel"],
-    extras_require=setup_spec.extras,
-    license="Apache 2.0",
-    options={'bdist_wheel': {'plat_name': plat_name}},
-)
diff --git a/sml/cluster/BUILD.bazel b/sml/cluster/BUILD.bazel
index 641962ee9..5f789eb76 100644
--- a/sml/cluster/BUILD.bazel
+++ b/sml/cluster/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "kmeans",
     srcs = ["kmeans.py"],
 )
diff --git a/sml/cluster/tests/BUILD.bazel b/sml/cluster/tests/BUILD.bazel
index 9e30b6f68..413069c74 100644
--- a/sml/cluster/tests/BUILD.bazel
+++ b/sml/cluster/tests/BUILD.bazel
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_test")
+load("//bazel:spu.bzl", "spu_py_test")
 
 package(default_visibility = ["//visibility:public"])
 
-py_test(
+spu_py_test(
     name = "kmeans_test",
     srcs = ["kmeans_test.py"],
     deps = [
         "//sml/cluster:kmeans",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/decomposition/BUILD.bazel b/sml/decomposition/BUILD.bazel
index c80a6751d..38e4d92f4 100644
--- a/sml/decomposition/BUILD.bazel
+++ b/sml/decomposition/BUILD.bazel
@@ -12,17 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "pca",
     srcs = ["pca.py"],
     deps = ["//sml/utils:extmath"],
 )
 
-py_library(
+spu_py_library(
     name = "nmf",
     srcs = ["nmf.py"],
 )
diff --git a/sml/decomposition/tests/BUILD.bazel b/sml/decomposition/tests/BUILD.bazel
index fff722206..ae5784aa7 100644
--- a/sml/decomposition/tests/BUILD.bazel
+++ b/sml/decomposition/tests/BUILD.bazel
@@ -23,6 +23,7 @@ py_test(
         "//sml/decomposition:pca",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -33,5 +34,6 @@ py_test(
         "//sml/decomposition:nmf",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/ensemble/BUILD.bazel b/sml/ensemble/BUILD.bazel
index 2572dc683..88ab3d102 100644
--- a/sml/ensemble/BUILD.bazel
+++ b/sml/ensemble/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "adaboost",
     srcs = ["adaboost.py"],
     deps = [
@@ -24,7 +24,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "forest",
     srcs = ["forest.py"],
     deps = [
diff --git a/sml/ensemble/tests/BUILD.bazel b/sml/ensemble/tests/BUILD.bazel
index 6815cf853..5b068b310 100644
--- a/sml/ensemble/tests/BUILD.bazel
+++ b/sml/ensemble/tests/BUILD.bazel
@@ -23,6 +23,7 @@ py_test(
         "//sml/ensemble:adaboost",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -33,5 +34,6 @@ py_test(
         "//sml/ensemble:forest",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/feature_selection/BUILD.bazel b/sml/feature_selection/BUILD.bazel
index dc512dcb3..e2acd5f66 100644
--- a/sml/feature_selection/BUILD.bazel
+++ b/sml/feature_selection/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "univariate_selection",
     srcs = ["univariate_selection.py"],
 )
diff --git a/sml/feature_selection/tests/BUILD.bazel b/sml/feature_selection/tests/BUILD.bazel
index 1a776e0bb..4a570e135 100644
--- a/sml/feature_selection/tests/BUILD.bazel
+++ b/sml/feature_selection/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/feature_selection:univariate_selection",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/gaussian_process/BUILD.bazel b/sml/gaussian_process/BUILD.bazel
index 0c8f6d1a4..b0e219ddf 100644
--- a/sml/gaussian_process/BUILD.bazel
+++ b/sml/gaussian_process/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "_gpc",
     srcs = ["_gpc.py"],
     deps = [
@@ -25,12 +25,12 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "kernels",
     srcs = ["kernels.py"],
 )
 
-py_library(
+spu_py_library(
     name = "ovo_ovr",
     srcs = ["ovo_ovr.py"],
 )
diff --git a/sml/gaussian_process/tests/BUILD.bazel b/sml/gaussian_process/tests/BUILD.bazel
index 52e9a0a3c..da0c809df 100644
--- a/sml/gaussian_process/tests/BUILD.bazel
+++ b/sml/gaussian_process/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/gaussian_process:_gpc",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/linear_model/BUILD.bazel b/sml/linear_model/BUILD.bazel
index fa4fdd158..69276ee8b 100644
--- a/sml/linear_model/BUILD.bazel
+++ b/sml/linear_model/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "sgd_classifier",
     srcs = ["sgd_classifier.py"],
     deps = [
@@ -24,7 +24,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "logistic",
     srcs = ["logistic.py"],
     deps = [
@@ -32,7 +32,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "ridge",
     srcs = ["ridge.py"],
     deps = [
@@ -40,12 +40,12 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "pla",
     srcs = ["pla.py"],
 )
 
-py_binary(
+spu_py_library(
     name = "glm",
     srcs = ["glm.py"],
     deps = [
@@ -55,7 +55,7 @@ py_binary(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "quantile",
     srcs = ["quantile.py"],
     deps = [
diff --git a/sml/linear_model/tests/BUILD.bazel b/sml/linear_model/tests/BUILD.bazel
index f729c2067..327be63d9 100644
--- a/sml/linear_model/tests/BUILD.bazel
+++ b/sml/linear_model/tests/BUILD.bazel
@@ -27,6 +27,8 @@ py_test(
         "//sml/linear_model:sgd_classifier",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip//jax:pkg",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -37,6 +39,8 @@ py_test(
         "//sml/linear_model:logistic",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//pandas:pkg",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -48,6 +52,7 @@ py_test(
         "//sml/linear_model:ridge",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -58,6 +63,9 @@ py_test(
         "//sml/linear_model:pla",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip//jax:pkg",
+        "@spu_pip_dev//pandas:pkg",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -68,6 +76,7 @@ py_test(
         "//sml/linear_model:glm",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
@@ -78,5 +87,8 @@ py_test(
         "//sml/linear_model:quantile",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip//jax:pkg",
+        "@spu_pip_dev//pandas:pkg",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/linear_model/utils/BUILD.bazel b/sml/linear_model/utils/BUILD.bazel
index 273290734..c7ff6fe2f 100644
--- a/sml/linear_model/utils/BUILD.bazel
+++ b/sml/linear_model/utils/BUILD.bazel
@@ -12,27 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "link",
     srcs = ["link.py"],
 )
 
-py_library(
+spu_py_library(
     name = "loss",
     srcs = ["loss.py"],
     deps = [":link"],
 )
 
-py_library(
+spu_py_library(
     name = "solver",
     srcs = ["solver.py"],
 )
 
-py_library(
+spu_py_library(
     name = "_linprog_simplex",
     srcs = ["_linprog_simplex.py"],
 )
diff --git a/sml/metrics/classification/BUILD.bazel b/sml/metrics/classification/BUILD.bazel
index 0bcaebe59..446294dbf 100644
--- a/sml/metrics/classification/BUILD.bazel
+++ b/sml/metrics/classification/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library", "py_test")
+load("//bazel:spu.bzl", "spu_py_library", "spu_py_test")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "classification",
     srcs = ["classification.py"],
     deps = [
@@ -26,23 +26,24 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "auc",
     srcs = ["auc.py"],
     deps = ["//spu/ops/groupby"],
 )
 
-py_test(
+spu_py_test(
     name = "classification_test",
     srcs = ["classification_test.py"],
     deps = [
         ":classification",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
-py_binary(
+spu_py_library(
     name = "classification_emul",
     srcs = ["classification_emul.py"],
     deps = [
diff --git a/sml/metrics/regression/BUILD.bazel b/sml/metrics/regression/BUILD.bazel
index 88961748e..7e9ef59c4 100644
--- a/sml/metrics/regression/BUILD.bazel
+++ b/sml/metrics/regression/BUILD.bazel
@@ -12,26 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library", "py_test")
+load("//bazel:spu.bzl", "spu_py_binary", "spu_py_library", "spu_py_test")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "regression",
     srcs = ["regression.py"],
 )
 
-py_test(
+spu_py_test(
     name = "regression_test",
     srcs = ["regression_test.py"],
     deps = [
         ":regression",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
-py_binary(
+spu_py_binary(
     name = "regression_emul",
     srcs = ["regression_emul.py"],
     deps = [
diff --git a/sml/naive_bayes/BUILD.bazel b/sml/naive_bayes/BUILD.bazel
index 369cd2c5d..7b698db21 100644
--- a/sml/naive_bayes/BUILD.bazel
+++ b/sml/naive_bayes/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "gnb",
     srcs = ["gnb.py"],
 )
diff --git a/sml/naive_bayes/tests/BUILD.bazel b/sml/naive_bayes/tests/BUILD.bazel
index ca41e4605..8d4680fc8 100644
--- a/sml/naive_bayes/tests/BUILD.bazel
+++ b/sml/naive_bayes/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/naive_bayes:gnb",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/neighbors/BUILD.bazel b/sml/neighbors/BUILD.bazel
index ff98f7eb8..beb0cc908 100644
--- a/sml/neighbors/BUILD.bazel
+++ b/sml/neighbors/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "knn",
     srcs = ["knn.py"],
 )
diff --git a/sml/neighbors/tests/BUILD.bazel b/sml/neighbors/tests/BUILD.bazel
index 5c7052793..c294ab560 100644
--- a/sml/neighbors/tests/BUILD.bazel
+++ b/sml/neighbors/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/neighbors:knn",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/preprocessing/BUILD.bazel b/sml/preprocessing/BUILD.bazel
index 958236c84..ddc2ad7ca 100644
--- a/sml/preprocessing/BUILD.bazel
+++ b/sml/preprocessing/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "preprocessing",
     srcs = ["preprocessing.py"],
     deps = [
diff --git a/sml/preprocessing/tests/BUILD.bazel b/sml/preprocessing/tests/BUILD.bazel
index 994ed985d..ffb24ef30 100644
--- a/sml/preprocessing/tests/BUILD.bazel
+++ b/sml/preprocessing/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/preprocessing",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/svm/BUILD.bazel b/sml/svm/BUILD.bazel
index fccd63ff4..514cf7854 100644
--- a/sml/svm/BUILD.bazel
+++ b/sml/svm/BUILD.bazel
@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "smo",
     srcs = ["smo.py"],
 )
 
-py_library(
+spu_py_library(
     name = "svm",
     srcs = ["svm.py"],
     deps = [
diff --git a/sml/svm/tests/BUILD.bazel b/sml/svm/tests/BUILD.bazel
index 91f54ba01..011da2277 100644
--- a/sml/svm/tests/BUILD.bazel
+++ b/sml/svm/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/svm",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/tree/BUILD.bazel b/sml/tree/BUILD.bazel
index 439ac5ea0..db4487c74 100644
--- a/sml/tree/BUILD.bazel
+++ b/sml/tree/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "tree",
     srcs = ["tree.py"],
 )
diff --git a/sml/tree/tests/BUILD.bazel b/sml/tree/tests/BUILD.bazel
index 2b60bdb16..e96e9140a 100644
--- a/sml/tree/tests/BUILD.bazel
+++ b/sml/tree/tests/BUILD.bazel
@@ -23,5 +23,6 @@ py_test(
         "//sml/tree",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
diff --git a/sml/utils/BUILD.bazel b/sml/utils/BUILD.bazel
index 26866239c..27bb8f7fa 100644
--- a/sml/utils/BUILD.bazel
+++ b/sml/utils/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "emulation",
     srcs = [
         "emulation.py",
@@ -31,12 +31,12 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "fxp_approx",
     srcs = ["fxp_approx.py"],
 )
 
-py_library(
+spu_py_library(
     name = "extmath",
     srcs = ["extmath.py"],
 )
diff --git a/spu/BUILD.bazel b/spu/BUILD.bazel
index 839f5fad4..2fbe08df3 100644
--- a/spu/BUILD.bazel
+++ b/spu/BUILD.bazel
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+load("@protobuf//bazel:py_proto_library.bzl", "py_proto_library")
 load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
-load("@rules_proto_grpc//python:defs.bzl", "python_proto_compile")
-load("@rules_python//python:defs.bzl", "py_library")
 load("@rules_python//python:packaging.bzl", "py_package")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -66,13 +66,13 @@ pybind_extension(
     deps = [
         ":exported_symbols.lds",
         ":version_script.lds",
-        "@psi//psi:launch",
+        "@psi//psi/apps/psi_launcher:launch",
         "@psi//psi/legacy:memory_psi",
         "@yacl//yacl/link",
     ],
 )
 
-py_library(
+spu_py_library(
     name = "api",
     srcs = [
         "api.py",
@@ -84,22 +84,19 @@ py_library(
     ],
 )
 
-python_proto_compile(
+py_proto_library(
     name = "psi_py_proto",
-    output_mode = "NO_PREFIX_FLAT",
-    protos = ["@psi//psi/proto:psi_proto"],
+    deps = ["@psi//psi/proto:psi_proto"],
 )
 
-python_proto_compile(
+py_proto_library(
     name = "link_py_proto",
-    output_mode = "NO_PREFIX_FLAT",
-    protos = ["@yacl//yacl/link:link_proto"],
+    deps = ["@yacl//yacl/link:link_proto"],
 )
 
-python_proto_compile(
+py_proto_library(
     name = "psi_v2_py_proto",
-    output_mode = "NO_PREFIX",
-    protos = ["@psi//psi/proto:psi_v2_proto"],
+    deps = ["@psi//psi/proto:psi_v2_proto"],
 )
 
 # Hack generated protobuf due to https://github.com/protocolbuffers/protobuf/issues/1491
@@ -107,16 +104,39 @@ genrule(
     name = "psi_v2_py_proto_fixed",
     srcs = [":psi_v2_py_proto"],
     outs = ["psi_v2_pb2.py"],
-    cmd = "sed 's#from yacl.link import#from . import#g;s#from psi.proto import#from . import#g' $(SRCS) > $(OUTS)",
+    cmd = "sed 's#from yacl.link import#from . import#g;s#from psi.proto import#from . import#g;s#psi.proto.psi_v2_pb2#spu.psi_v2_pb2#g' $(SRCS) > $(OUTS)",
 )
 
-py_library(
+genrule(
+    name = "psi_py_proto_fixed",
+    srcs = [":psi_py_proto"],
+    outs = ["psi_pb2.py"],
+    cmd = "sed 's/psi.proto.psi_pb2/spu.psi_pb2/g' $(SRCS) > $(OUTS)",
+)
+
+genrule(
+    name = "pir_py_proto_fixed",
+    srcs = [":pir_py_proto"],
+    outs = ["pir_pb2.py"],
+    cmd = """
+    sed "s/psi.proto.pir_pb2/spu.pir_pb2/g" $(SRCS) > $(OUTS)
+    """,
+)
+
+genrule(
+    name = "link_py_proto_fixed",
+    srcs = [":link_py_proto"],
+    outs = ["link_pb2.py"],
+    cmd = "sed 's/yacl.link.link_pb2/spu.link_pb2/g' $(SRCS) > $(OUTS)",  # module path must match outs in package spu
+)
+
+spu_py_library(
     name = "psi",
     srcs = [
         "psi.py",
-        ":link_py_proto",
-        ":pir_py_proto",
-        ":psi_py_proto",
+        ":link_py_proto_fixed",
+        ":pir_py_proto_fixed",
+        ":psi_py_proto_fixed",
         ":psi_v2_py_proto_fixed",
     ],
     data = [
@@ -125,13 +145,12 @@ py_library(
     ],
 )
 
-python_proto_compile(
+py_proto_library(
     name = "pir_py_proto",
-    output_mode = "NO_PREFIX_FLAT",
-    protos = ["@psi//psi/proto:pir_proto"],
+    deps = ["@psi//psi/proto:pir_proto"],
 )
 
-py_library(
+spu_py_library(
     name = "init",
     srcs = [
         "__init__.py",
diff --git a/spu/experimental/BUILD.bazel b/spu/experimental/BUILD.bazel
index 09f592d25..82dbfe193 100644
--- a/spu/experimental/BUILD.bazel
+++ b/spu/experimental/BUILD.bazel
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
-py_library(
+spu_py_library(
     name = "experimentals",
     srcs = [
         "__init__.py",
@@ -28,7 +28,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "make_cached_var",
     srcs = [
         "make_cached_var_impl.py",
@@ -38,7 +38,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "drop_cached_var",
     srcs = [
         "drop_cached_var_impl.py",
diff --git a/spu/intrinsic/BUILD.bazel b/spu/intrinsic/BUILD.bazel
index 691e3fa4a..3af31bf86 100644
--- a/spu/intrinsic/BUILD.bazel
+++ b/spu/intrinsic/BUILD.bazel
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
-py_library(
+spu_py_library(
     name = "all_intrinsics",
     srcs = [
         "__init__.py",
@@ -29,7 +29,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "example",
     srcs = [
         "example_impl.py",
@@ -39,7 +39,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "example_binary",
     srcs = [
         "example_binary_impl.py",
diff --git a/spu/intrinsic/add_new_intrinsic.py b/spu/intrinsic/add_new_intrinsic.py
index 5c77092c2..0aad6bd8d 100755
--- a/spu/intrinsic/add_new_intrinsic.py
+++ b/spu/intrinsic/add_new_intrinsic.py
@@ -98,7 +98,7 @@ def adapt_build(module_path, check_name):
         content = (
             content
             + f"""
-py_library(
+spu_py_library(
     name = "{check_name}",
     srcs = [
         "{check_name}_impl.py",
diff --git a/spu/libpsi.cc b/spu/libpsi.cc
index 0beb06eb9..9c7ca4d1d 100644
--- a/spu/libpsi.cc
+++ b/spu/libpsi.cc
@@ -18,7 +18,7 @@
 #include "yacl/base/exception.h"
 #include "yacl/link/context.h"
 
-#include "psi/launch.h"
+#include "psi/apps/psi_launcher/launch.h"
 #include "psi/legacy/memory_psi.h"
 #include "psi/utils/progress.h"
 
diff --git a/spu/ops/groupby/BUILD.bazel b/spu/ops/groupby/BUILD.bazel
index 2b6747a04..b38f10a40 100644
--- a/spu/ops/groupby/BUILD.bazel
+++ b/spu/ops/groupby/BUILD.bazel
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_library", "py_test")
+load("//bazel:spu.bzl", "spu_py_library", "spu_py_test")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "groupby",
     srcs = [
         "__init__.py",
@@ -31,7 +31,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "segmentation",
     srcs = [
         "segmentation.py",
@@ -39,7 +39,7 @@ py_library(
     deps = [":utils"],
 )
 
-py_library(
+spu_py_library(
     name = "aggregation",
     srcs = [
         "aggregation.py",
@@ -47,14 +47,14 @@ py_library(
     deps = [":utils"],
 )
 
-py_library(
+spu_py_library(
     name = "utils",
     srcs = [
         "utils.py",
     ],
 )
 
-py_library(
+spu_py_library(
     name = "groupby_via_shuffle",
     srcs = [
         "groupby_via_shuffle.py",
@@ -65,7 +65,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "shuffle",
     srcs = [
         "shuffle.py",
@@ -73,14 +73,14 @@ py_library(
     deps = [":utils"],
 )
 
-py_library(
+spu_py_library(
     name = "postprocess",
     srcs = [
         "postprocess.py",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "groupby_test",
     srcs = ["groupby_test.py"],
     deps = [
@@ -90,5 +90,6 @@ py_test(
         ":segmentation",
         "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//pandas:pkg",
     ],
 )
diff --git a/spu/tests/BUILD.bazel b/spu/tests/BUILD.bazel
index d9e348cbb..3c0289316 100644
--- a/spu/tests/BUILD.bazel
+++ b/spu/tests/BUILD.bazel
@@ -12,20 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("@rules_python//python:defs.bzl", "py_binary", "py_library", "py_test")
+load("//bazel:spu.bzl", "spu_py_binary", "spu_py_library", "spu_py_test")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "jnp_testbase",
     srcs = ["jnp_testbase.py"],
     deps = [
         "//spu:api",
+        "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//absl_py:pkg",
     ],
 )
 
-py_library(
+spu_py_library(
     name = "utils",
     srcs = ["utils.py"],
     deps = [
@@ -33,7 +35,7 @@ py_library(
     ],
 )
 
-py_binary(
+spu_py_binary(
     name = "np_op_status",
     srcs = ["np_op_status.py"],
     deps = [
@@ -41,7 +43,7 @@ py_binary(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_aby3_r128_test",
     timeout = "long",
     srcs = ["jnp_aby3_r128_test.py"],
@@ -50,7 +52,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_aby3_r128_test_x64",
     timeout = "long",
     srcs = ["jnp_aby3_r128_test.py"],
@@ -63,7 +65,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_aby3_r64_test",
     timeout = "long",
     srcs = ["jnp_aby3_r64_test.py"],
@@ -72,7 +74,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_aby3_r64_test_x64",
     timeout = "long",
     srcs = ["jnp_aby3_r64_test.py"],
@@ -85,7 +87,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_cheetah_r64_test",
     size = "enormous",
     srcs = ["jnp_cheetah_r64_test.py"],
@@ -94,7 +96,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_cheetah_r64_test_x64",
     size = "enormous",
     srcs = ["jnp_cheetah_r64_test.py"],
@@ -107,7 +109,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_semi2k_r128_test",
     timeout = "long",
     srcs = ["jnp_semi2k_r128_test.py"],
@@ -116,7 +118,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_semi2k_r128_test_x64",
     timeout = "long",
     srcs = ["jnp_semi2k_r128_test.py"],
@@ -129,7 +131,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_semi2k_r64_test",
     timeout = "long",
     srcs = ["jnp_semi2k_r64_test.py"],
@@ -138,7 +140,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_semi2k_r64_test_x64",
     timeout = "long",
     srcs = ["jnp_semi2k_r64_test.py"],
@@ -151,7 +153,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_ref2k_r64_test",
     timeout = "long",
     srcs = ["jnp_ref2k_r64_test.py"],
@@ -160,7 +162,7 @@ py_test(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jnp_ref2k_r64_test_x64",
     timeout = "long",
     srcs = ["jnp_ref2k_r64_test.py"],
@@ -173,7 +175,7 @@ py_test(
     ],
 )
 
-py_binary(
+spu_py_binary(
     name = "jnp_debug",
     srcs = ["jnp_debug.py"],
     deps = [
@@ -182,52 +184,58 @@ py_binary(
     ],
 )
 
-py_test(
+spu_py_test(
     name = "spu_compiler_test",
     srcs = ["spu_compiler_test.py"],
     deps = [
         "//spu:api",
+        "//spu:init",
         "//spu/utils:frontend",
         "//spu/utils:simulation",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "jax_sanity_test",
     srcs = ["jax_sanity_test.py"],
     deps = [
         ":jnp_testbase",
+        "@spu_pip_dev//scikit_learn:pkg",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "spu_runtime_test",
     srcs = ["spu_runtime_test.py"],
     deps = [
         "//spu:api",
+        "//spu:init",
         "//spu/utils:simulation",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "spu_io_test",
     srcs = ["spu_io_test.py"],
     deps = [
         "//spu:api",
+        "//spu:init",
         "//spu/utils:simulation",
+        "@spu_pip_dev//absl_py:pkg",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "link_test",
     srcs = ["link_test.py"],
     deps = [
         ":utils",
         "//spu:api",
+        "//spu:init",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "legacy_psi_test",
     srcs = ["legacy_psi_test.py"],
     data = [
@@ -239,12 +247,13 @@ py_test(
     ],
     deps = [
         ":utils",
+        "//spu:init",
         "//spu:psi",
         "//spu/utils:simulation",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "psi_test",
     srcs = ["psi_test.py"],
     data = [
@@ -256,11 +265,12 @@ py_test(
     ],
     deps = [
         ":utils",
+        "//spu:init",
         "//spu:psi",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "ub_psi_test",
     srcs = ["ub_psi_test.py"],
     data = [
@@ -272,11 +282,12 @@ py_test(
     ],
     deps = [
         ":utils",
+        "//spu:init",
         "//spu:psi",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "pir_test",
     srcs = ["pir_test.py"],
     data = [
@@ -288,19 +299,29 @@ py_test(
     ],
     deps = [
         ":utils",
+        "//spu:init",
         "//spu:psi",
+        "@spu_pip_dev//pandas:pkg",
     ],
 )
 
-py_test(
+spu_py_test(
     name = "frontend_test",
     srcs = ["frontend_test.py"],
     deps = [
+        "//spu:init",
         "//spu/utils:frontend",
-    ],
+    ] + select({
+        "@bazel_tools//src/conditions:linux_x86_64": [
+            "@spu_pip_dev//tensorflow_cpu:pkg",
+        ],
+        "//conditions:default": [
+            "@spu_pip_dev//tensorflow:pkg",
+        ],
+    }),
 )
 
-py_test(
+spu_py_test(
     name = "distributed_test",
     timeout = "short",
     srcs = ["distributed_test.py"],
@@ -310,14 +331,24 @@ py_test(
     ],
     deps = [
         ":utils",
+        "//spu:init",
         "//spu/utils:distributed",
-    ],
+        "@spu_pip_dev//grpcio:pkg",
+    ] + select({
+        "@bazel_tools//src/conditions:linux_x86_64": [
+            "@spu_pip_dev//tensorflow_cpu:pkg",
+        ],
+        "//conditions:default": [
+            "@spu_pip_dev//tensorflow:pkg",
+        ],
+    }),
 )
 
-py_test(
+spu_py_test(
     name = "jax_compile_test",
     srcs = ["jax_compile_test.py"],
     deps = [
         ":jnp_testbase",
+        "@spu_pip_dev//flax:pkg",
     ],
 )
diff --git a/spu/tests/legacy_psi_test.py b/spu/tests/legacy_psi_test.py
index 477e964c7..e286bf2c7 100644
--- a/spu/tests/legacy_psi_test.py
+++ b/spu/tests/legacy_psi_test.py
@@ -109,20 +109,6 @@ def prep_data(self):
 
         return data, expected
 
-    def test_reveal(self):
-        data, expected = self.prep_data()
-        expected.sort()
-
-        def fn(lctx):
-            config = psi.MemoryPsiConfig(
-                psi_type=psi.PsiType.ECDH_PSI_2PC, broadcast_result=True
-            )
-            joint = psi.mem_psi(lctx, config, data[lctx.rank])
-            joint.sort()
-            return self.assertEqual(joint, expected)
-
-        self.run_psi(fn)
-
     def test_reveal_to(self):
         data, expected = self.prep_data()
         expected.sort()
@@ -161,28 +147,6 @@ def test_ecdh_3pc(self):
             3, inputs, outputs, selected_fields, psi.PsiType.ECDH_PSI_3PC
         )
 
-    def test_kkrt_2pc(self):
-        print("----------test_kkrt_2pc-------------")
-
-        inputs = ["spu/tests/data/alice.csv", "spu/tests/data/bob.csv"]
-        outputs = ["./alice-kkrt.csv", "./bob-kkrt.csv"]
-        selected_fields = ["id", "idx"]
-
-        self.run_streaming_psi(
-            2, inputs, outputs, selected_fields, psi.PsiType.KKRT_PSI_2PC
-        )
-
-    def test_ecdh_2pc(self):
-        print("----------test_ecdh_2pc-------------")
-
-        inputs = ["spu/tests/data/alice.csv", "spu/tests/data/bob.csv"]
-        outputs = ["./alice-ecdh.csv", "./bob-ecdh.csv"]
-        selected_fields = ["id", "idx"]
-
-        self.run_streaming_psi(
-            2, inputs, outputs, selected_fields, psi.PsiType.ECDH_PSI_2PC
-        )
-
     def test_dppsi_2pc(self):
         print("----------test_dppsi_2pc-------------")
 
diff --git a/spu/utils/BUILD.bazel b/spu/utils/BUILD.bazel
index 71f9e5448..f537897ab 100644
--- a/spu/utils/BUILD.bazel
+++ b/spu/utils/BUILD.bazel
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+load("@grpc//bazel:python_rules.bzl", "py_grpc_library", "py_proto_library")
 load("@rules_proto//proto:defs.bzl", "proto_library")
-load("@rules_proto_grpc//python:defs.bzl", "python_grpc_compile")
-load("@rules_python//python:defs.bzl", "py_library")
+load("//bazel:spu.bzl", "spu_py_library")
 
 package(default_visibility = ["//visibility:public"])
 
-py_library(
+spu_py_library(
     name = "simulation",
     srcs = ["simulation.py"],
     deps = [
@@ -32,14 +32,19 @@ proto_library(
     srcs = ["distributed.proto"],
 )
 
-python_grpc_compile(
+py_proto_library(
+    name = "distributed_py_proto",
+    deps = [":distributed_proto"],
+)
+
+py_grpc_library(
     name = "distributed_py_proto_grpc",
-    output_mode = "NO_PREFIX",
-    prefix_path = "../..",
-    protos = ["distributed_proto"],
+    srcs = [":distributed_proto"],
+    strip_prefixes = ["../.."],
+    deps = [":distributed_py_proto"],
 )
 
-py_library(
+spu_py_library(
     name = "distributed_impl",
     srcs = [
         "distributed_impl.py",
@@ -52,7 +57,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "distributed",
     srcs = [
         "distributed.py",
@@ -63,7 +68,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "frontend",
     srcs = ["frontend.py"],
     deps = [
@@ -71,7 +76,7 @@ py_library(
     ],
 )
 
-py_library(
+spu_py_library(
     name = "polyfill",
     srcs = ["polyfill.py"],
 )
diff --git a/version.bzl b/version.bzl
new file mode 100644
index 000000000..04b358354
--- /dev/null
+++ b/version.bzl
@@ -0,0 +1,38 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SPU_VERSION = "0.9.4.dev20250103"
+
+def _spu_version_gen(ctx):
+    """Expands `template` into `out`, replacing @SPU_VERSION@ with SPU_VERSION."""
+    # The only placeholder substituted is the literal token "@SPU_VERSION@".
+    ctx.actions.expand_template(
+        template = ctx.file.template,
+        output = ctx.outputs.out,
+        substitutions = {"@SPU_VERSION@": SPU_VERSION},
+    )
+
+# Rule that writes `out` from the single file `template`; both attributes are
+# mandatory.
+spu_version_gen = rule(
+    implementation = _spu_version_gen,
+    attrs = {
+        # Exactly one input file; the implementation reads ctx.file.template.
+        "template": attr.label(mandatory = True, allow_single_file = True),
+        # Declared output file; the implementation writes ctx.outputs.out.
+        "out": attr.output(mandatory = True),
+    },
+    # Place the generated file in the genfiles tree rather than bin.
+    output_to_genfiles = True,
+)