From f44f09dfd263c2bb0238f4323e366a210e9d4aae Mon Sep 17 00:00:00 2001 From: Jenkins Date: Mon, 11 Nov 2024 11:22:13 +0000 Subject: [PATCH] Compute Library v24.11 --- Android.bp | 3 + CMakeLists.txt | 2 +- LICENSES/Apache-2.0.txt | 15 + LICENSE => LICENSES/MIT.txt | 9 +- README.md | 24 +- SConscript | 10 +- SConstruct | 21 +- arm_compute/core/CPP/CPPTypes.h | 8 +- arm_compute/core/QuantizationInfo.h | 78 ++- arm_compute/core/TensorInfo.h | 3 + arm_compute/core/utils/DataTypeUtils.h | 29 + .../function_info/ActivationLayerInfo.h | 1 + arm_compute/runtime/CL/functions/CLCast.h | 44 +- .../CPPBoxWithNonMaximaSuppressionLimit.h | 6 +- arm_compute/runtime/MemoryGroup.h | 30 +- arm_compute/runtime/MemoryManagerOnDemand.h | 10 +- arm_compute/runtime/MemoryRegion.h | 9 +- .../runtime/NEON/functions/NEArgMinMaxLayer.h | 6 +- .../NEON/functions/NEConvolutionLayer.h | 6 +- .../NEON/functions/NEDeconvolutionLayer.h | 14 +- .../functions/NEDepthwiseConvolutionLayer.h | 6 +- .../functions/NEDetectionPostProcessLayer.h | 14 +- arm_compute/runtime/NEON/functions/NEFFT1D.h | 6 +- .../NEON/functions/NEFFTConvolutionLayer.h | 14 +- .../NEON/functions/NEFullyConnectedLayer.h | 7 +- arm_compute/runtime/NEON/functions/NEGEMM.h | 6 +- .../runtime/NEON/functions/NEGEMMConv2d.h | 6 +- .../NEON/functions/NEGEMMConvolutionLayer.h | 6 +- .../functions/NEGEMMLowpMatrixMultiplyCore.h | 6 +- .../NEON/functions/NEGenerateProposalsLayer.h | 6 +- .../functions/NEInstanceNormalizationLayer.h | 6 +- .../NEON/functions/NEL2NormalizeLayer.h | 6 +- .../runtime/NEON/functions/NELSTMLayer.h | 6 +- .../NEON/functions/NELSTMLayerQuantized.h | 14 +- arm_compute/runtime/NEON/functions/NEMatMul.h | 6 +- .../NEON/functions/NENormalizationLayer.h | 6 +- .../runtime/NEON/functions/NEPooling3dLayer.h | 6 +- .../runtime/NEON/functions/NEPoolingLayer.h | 6 +- .../runtime/NEON/functions/NERNNLayer.h | 14 +- .../runtime/NEON/functions/NEReduceMean.h | 14 +- .../NEON/functions/NEReductionOperation.h | 6 +- .../runtime/NEON/functions/NESoftmaxLayer.h | 14 +- .../functions/NEWinogradConvolutionLayer.h | 6 +- .../low_level/CpuGemmAssemblyDispatch.h | 7 + .../experimental/operators/CpuDequantize.h | 82 +++ .../experimental/operators/CpuGEMMLowp.h | 96 +++ .../experimental/operators/CpuGemmConv2d.h | 7 + .../experimental/operators/CpuQuantize.h | 95 +++ docs/Doxyfile | 2 +- docs/user_guide/library.dox | 5 - docs/user_guide/operator_list.dox | 26 +- filelist.json | 14 +- scripts/format_code.py | 12 +- src/BUILD.bazel | 6 +- src/CMakeLists.txt | 4 + src/common/cpuinfo/CpuInfo.cpp | 7 +- src/core/CL/cl_kernels/common/cast.cl | 21 +- .../cl_kernels/common/quantization_layer.cl | 19 +- .../CL/cl_kernels/common/softmax_layer.cl | 6 +- src/core/CPP/CPPTypes.cpp | 8 +- src/core/NEON/NEAsymm.h | 36 + .../NEON/kernels/arm_gemm/gemm_bf16bf16.cpp | 2 + src/core/NEON/kernels/arm_gemm/utils.hpp | 10 +- src/core/NEON/wrapper/intrinsics/cvt.h | 20 +- src/core/common/Registrars.h | 8 +- src/core/helpers/LUTManager.cpp | 60 +- src/core/helpers/LUTManager.h | 13 +- src/cpu/kernels/CpuActivationKernel.cpp | 16 +- src/cpu/kernels/CpuCastKernel.cpp | 8 +- src/cpu/kernels/CpuIm2ColKernel.cpp | 4 +- src/cpu/kernels/CpuSoftmaxKernel.cpp | 69 +- src/cpu/kernels/CpuSoftmaxKernel.h | 14 +- src/cpu/kernels/cast/generic/neon/fp16.cpp | 6 +- .../elementwise_binary/generic/neon/impl.h | 625 +++++------------- src/cpu/kernels/quantize/generic/neon/impl.h | 42 +- .../reduction_layer/generic/neon/impl.h | 15 +- .../kernels/softmax/generic/sme2/qasymm8.cpp | 442 ++++++------- 
.../kernels/softmax/generic/sve/impl_bf16.cpp | 232 +++++++ src/cpu/kernels/softmax/list.h | 12 + src/cpu/operators/CpuDirectConv2d.h | 14 +- src/cpu/operators/CpuDirectConv3d.h | 14 +- .../CpuGemmLowpMatrixMultiplyCore.cpp | 2 + src/cpu/operators/CpuMatMul.cpp | 15 +- src/gpu/cl/kernels/ClCastKernel.cpp | 39 +- src/gpu/cl/kernels/ClQuantizeKernel.cpp | 63 +- src/gpu/cl/operators/ClCast.cpp | 10 +- src/gpu/cl/operators/ClCast.h | 23 +- src/runtime/MemoryManagerOnDemand.cpp | 13 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 1 + src/runtime/NEON/functions/NEGEMM.cpp | 5 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 3 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 5 +- .../NEGEMMLowpMatrixMultiplyCore.cpp | 36 +- src/runtime/NEON/functions/NEMatMul.cpp | 13 +- .../functions/NEWinogradConvolutionLayer.cpp | 3 +- .../low_level/CpuGemmAssemblyDispatch.cpp | 25 +- .../experimental/operators/CpuDequantize.cpp | 64 ++ .../experimental/operators/CpuGEMMLowp.cpp | 115 ++++ .../experimental/operators/CpuGemmConv2d.cpp | 5 + .../experimental/operators/CpuQuantize.cpp | 62 ++ support/Bfloat16.h | 2 +- support/SaturateCast.h | 20 +- tests/CMakeLists.txt | 87 +-- tests/framework/instruments/hwc_names.hpp | 10 +- tests/validation/CL/Cast.cpp | 393 ++++++----- tests/validation/CL/LogSoftmaxLayer.cpp | 79 ++- tests/validation/CL/QuantizationLayer.cpp | 52 +- tests/validation/CMakeLists.txt | 143 +--- tests/validation/CPP/LUT.cpp | 25 +- tests/validation/NEON/Cast.cpp | 129 ++-- tests/validation/NEON/GEMMLowp.cpp | 25 +- tests/validation/NEON/MatMul.cpp | 56 ++ tests/validation/NEON/Permute.cpp | 25 +- tests/validation/NEON/QuantizationLayer.cpp | 36 + tests/validation/NEON/ReduceMean.cpp | 88 +++ tests/validation/NEON/Scale.cpp | 21 +- tests/validation/NEON/SoftmaxLayer.cpp | 25 +- .../validation/NEON/UNIT/TensorAllocator.cpp | 12 +- tests/validation/Validation.h | 16 +- tests/validation/fixtures/CastFixture.h | 12 +- .../fixtures/CpuDequantizeFixture.h | 94 +++ .../validation/fixtures/CpuGEMMLowpFixture.h | 170 +++++ .../fixtures/CpuGemmAssemblyDispatchFixture.h | 286 ++++++-- .../fixtures/CpuGemmConv2dFixture.h | 177 +++++ .../validation/fixtures/CpuQuantizeFixture.h | 104 +++ tests/validation/fixtures/GEMMLowpFixture.h | 31 +- tests/validation/fixtures/PermuteFixture.h | 15 +- .../validation/fixtures/SoftmaxLayerFixture.h | 5 +- tests/validation/reference/GEMMLowp.cpp | 15 +- tests/validation/reference/SoftmaxLayer.cpp | 87 ++- tests/validation/reference/SoftmaxLayer.h | 11 +- .../low_level/CpuGemmAssemblyDispatch.cpp | 202 ++++-- .../experimental/operators/CpuDequantize.cpp | 125 ++++ .../experimental/operators/CpuGEMMLowp.cpp | 233 +++++++ .../experimental/operators/CpuGemmConv2d.cpp | 53 ++ .../experimental/operators/CpuQuantize.cpp | 100 +++ utils/TypePrinter.h | 11 + 137 files changed, 4412 insertions(+), 1659 deletions(-) create mode 100644 LICENSES/Apache-2.0.txt rename LICENSE => LICENSES/MIT.txt (82%) create mode 100644 arm_compute/runtime/experimental/operators/CpuDequantize.h create mode 100644 arm_compute/runtime/experimental/operators/CpuGEMMLowp.h create mode 100644 arm_compute/runtime/experimental/operators/CpuQuantize.h create mode 100644 src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp create mode 100644 src/runtime/experimental/operators/CpuDequantize.cpp create mode 100644 src/runtime/experimental/operators/CpuGEMMLowp.cpp create mode 100644 src/runtime/experimental/operators/CpuQuantize.cpp create mode 100644 tests/validation/fixtures/CpuDequantizeFixture.h create mode 
100644 tests/validation/fixtures/CpuGEMMLowpFixture.h create mode 100644 tests/validation/fixtures/CpuQuantizeFixture.h create mode 100644 tests/validation/runtime/experimental/operators/CpuDequantize.cpp create mode 100644 tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp create mode 100644 tests/validation/runtime/experimental/operators/CpuQuantize.cpp diff --git a/Android.bp b/Android.bp index d6516fec72..159aebb516 100644 --- a/Android.bp +++ b/Android.bp @@ -1025,11 +1025,14 @@ cc_library_static { "src/runtime/experimental/operators/CpuActivation.cpp", "src/runtime/experimental/operators/CpuAdd.cpp", "src/runtime/experimental/operators/CpuDepthwiseConv2d.cpp", + "src/runtime/experimental/operators/CpuDequantize.cpp", "src/runtime/experimental/operators/CpuElementwise.cpp", + "src/runtime/experimental/operators/CpuGEMMLowp.cpp", "src/runtime/experimental/operators/CpuGemm.cpp", "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuQuantize.cpp", "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index 321a83bfbb..5a31e61a76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 42.0.0 + VERSION 43.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt new file mode 100644 index 0000000000..e45f145de6 --- /dev/null +++ b/LICENSES/Apache-2.0.txt @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2008-2023 The Khronos Group Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSE b/LICENSES/MIT.txt similarity index 82% rename from LICENSE rename to LICENSES/MIT.txt index 781685ab31..ed43132fe0 100644 --- a/LICENSE +++ b/LICENSES/MIT.txt @@ -1,6 +1,11 @@ -MIT License +# SPDX-FileCopyrightText: 2012-2017 Christian Rau +# SPDX-FileCopyrightText: 2017 Leon Merten Lohse +# SPDX-FileCopyrightText: 2017 Sean Barrett +# SPDX-FileCopyrightText: 2017-2024 Arm Limited +# +# SPDX-License-Identifier: MIT -Copyright (c) 2017-2024 Arm Limited +MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 97ffe318c4..a5387961b4 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-24.09-green) +# Compute Library ![](https://img.shields.io/badge/latest_release-24.11-green) The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPU architectures.
@@ -37,7 +37,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-24.09-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/index.xhtml) +[![Documentation](https://img.shields.io/badge/documentation-24.11-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/index.xhtml) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.09-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.09) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.11) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong @@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract ## Experimental builds -**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/how_to_build.xhtml) for more details. +**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/how_to_build.xhtml) for more details.
## How to contribute -Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/contribution_guidelines.xhtml). +Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/contribution_guidelines.xhtml). ### Developer Certificate of Origin (DCO) Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/) diff --git a/SConscript b/SConscript index 2aff67d8ca..784db8edcb 100644 --- a/SConscript +++ b/SConscript @@ -33,8 +33,8 @@ import codecs import platform import SCons -VERSION = "v24.09" -LIBRARY_VERSION_MAJOR = 42 +VERSION = "v24.11" +LIBRARY_VERSION_MAJOR = 43 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) @@ -627,12 +627,8 @@ custom_operators = [] custom_types = [] custom_layouts = [] -use_custom_ops = env['high_priority'] or env['build_config'] +use_custom_ops = env['build_config'] -if env['high_priority']: - custom_operators = filelist['high_priority'] - custom_types = ['all'] - custom_layouts = ['all'] if env['build_config']: custom_operators, custom_types, custom_layouts = read_build_config_json(env['build_config']) diff --git a/SConstruct b/SConstruct index c4bfef826d..8d7bd291e8 100644 --- a/SConstruct +++ b/SConstruct @@ -116,7 +116,6 @@ vars.AddVariables( PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept), PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept), BoolVariable("exceptions", "Enable/disable C++ exception support", True), - BoolVariable("high_priority", "Generate a library containing only the high priority operators", False), PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept), PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. 
In order to use this option, the external tests directory must have the following structure: EXTERNAL_TESTS_DIR: @@ -519,21 +518,11 @@ if not GetOption("help"): # Thus for backward compatibility, we include this flag only for NDK < r23 env.Append(CXXFLAGS = ['-no-integrated-as']) -if env['high_priority'] and env['build_config']: - print("The high priority library cannot be built in conjunction with a user-specified build configuration") - Exit(1) - -if not env['high_priority'] and not env['build_config']: - env.Append(CPPDEFINES = ['ARM_COMPUTE_GRAPH_ENABLED']) - data_types = [] data_layouts = [] # Set correct data types / layouts to build -if env['high_priority']: - data_types = ['all'] - data_layouts = ['all'] -elif env['build_config']: +if env['build_config']: data_types, data_layouts = read_build_config_json(env['build_config']) else: data_types = env['data_type_support'] @@ -613,7 +602,9 @@ else: env.Append(CXXFLAGS = ['-O3']) else: # on windows we use clang-cl which does not support the option -O3 - env.Append(CXXFLAGS = ['-O2']) + if not version_at_least(compiler_ver, '17.0.0'): + # Disable optimizations in clang 17 or later because the compiler crashes with -O2 + env.Append(CXXFLAGS = ['-O2']) if env['asserts']: env.Append(CPPDEFINES = ['ARM_COMPUTE_ASSERTS_ENABLED']) @@ -653,7 +644,7 @@ Export('version_at_least') SConscript('./SConscript', variant_dir=build_path, duplicate=0) -if env['examples'] and (env['build_config'] or env['high_priority']): +if env['examples'] and env['build_config']: print("WARNING: Building examples for selected operators not supported. Use examples=0") Return() @@ -664,7 +655,7 @@ if env['examples'] and env['exceptions']: SConscript('./examples/SConscript', variant_dir='%s/examples' % build_path, duplicate=0) if env['exceptions']: - if env['build_config'] or env['high_priority']: + if env['build_config']: print("WARNING: Building tests for selected operators not supported") Return() if env['os'] == 'bare_metal' and env['arch'] == 'armv7a': diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index 7ee144e2cb..f4ad79e32c 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" +#include #include namespace arm_compute @@ -180,7 +181,12 @@ class CPUInfo final * * @return Vector length if sme2 is enabled, otherwise returns 0. */ - uint64_t get_sme2_vector_length() const; + uint64_t get_sme2_vector_length_in_bytes() const; + /** Return the vector length in bits for sme2 + * + * @return Vector length if sme2 is enabled, otherwise returns 0. 
+ */ + uint64_t get_sme2_vector_length_in_bits() const; private: struct Impl; diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h index aecba3712e..e8cc98f9e4 100644 --- a/arm_compute/core/QuantizationInfo.h +++ b/arm_compute/core/QuantizationInfo.h @@ -63,6 +63,31 @@ struct UniformQuantizationInfo int32_t offset; }; +/** Quantization info when assuming per layer quantization */ +struct UniformRequantizationInfo +{ + /** Default constructor */ + UniformRequantizationInfo() : scale(0.f), offset(0.f) + { + } + /** Constructor + * + * @param[in] scale Quantization scale + * @param[in] offset Quantization offset + */ + UniformRequantizationInfo(float scale, float offset) : scale(scale), offset(offset) + { + } + /** Checks if the scale and offset are both zero */ + bool empty() const + { + return (scale == 0) && (offset == 0); + } + + float scale; + float offset; +}; + /** Quantization information */ class QuantizationInfo { @@ -232,6 +257,13 @@ struct Qasymm8QuantizationHelper return static_cast(arm_compute::utility::clamp(quantized)); } + static inline QUANTIZED_TYPE quantize(float value, const UniformRequantizationInfo &qinfo) + { + ARM_COMPUTE_ERROR_ON(qinfo.scale == 0); + const int quantized = support::cpp11::lround(value / qinfo.scale + qinfo.offset); + return static_cast(arm_compute::utility::clamp(quantized)); + } + /** Quantize a value given a 8-bit asymmetric quantization scheme using a specific rounding policy * * @param[in] value Value to quantize @@ -253,6 +285,21 @@ struct Qasymm8QuantizationHelper return static_cast(arm_compute::utility::clamp(quantized)); } + static inline QUANTIZED_TYPE + quantize(float value, const UniformRequantizationInfo &qinfo, RoundingPolicy rounding_policy) + { + if (rounding_policy == RoundingPolicy::TO_NEAREST_UP) + { + return quantize(value, qinfo); + } + + ARM_COMPUTE_ERROR_ON(qinfo.scale == 0); + + // We round after adding the offset, because the offset is also float + const int quantized = arm_compute::round(value / qinfo.scale + qinfo.offset, rounding_policy); + return static_cast(arm_compute::utility::clamp(quantized)); + } + /** Quantize a value given a 8-bit asymmetric quantization scheme * * @param[in] value Value to quantize @@ -588,7 +635,11 @@ inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo) return dequantize_s32(value, qinfo.uniform()); } -/* +/** Compute the requantization offset and scale + * + * @deprecated because reequantization using integer offsets creates rounding issues. + * Please use @ref arm_compute::compute_requantization_scale_float_offset() instead. + * * In case of requantization of a quantized input tensor to an output tensor with another quantization * instead of applying dequantization and then a quantization functions, we just compute new scale and * offset. 
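The UniformRequantizationInfo added above keeps the requantization offset in the float domain, and the @deprecated note points at the float-offset helper compute_requantization_scale_float_offset() introduced further down in this header. A minimal sketch of how these pieces could be combined to requantize a single QASYMM8 value; the quantization parameters and the input value are illustrative, not taken from the library:

    #include "arm_compute/core/QuantizationInfo.h"

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Requantize one QASYMM8 value using the float-offset info from this patch.
    uint8_t requantize_example()
    {
        using namespace arm_compute;

        const UniformQuantizationInfo in_q(0.5f, 10);  // real = 0.5f  * (q - 10)
        const UniformQuantizationInfo out_q(0.25f, 3); // real = 0.25f * (q - 3)

        // scale  = out.scale / in.scale                          = 0.5
        // offset = out.offset - in.offset * in.scale / out.scale = -17.0
        const UniformRequantizationInfo req = compute_requantization_scale_float_offset(in_q, out_q);

        // q_out = round(q_in / scale + offset), clamped to the QASYMM8 range;
        // 42 maps to 67, i.e. the same real value 16.0f under out_q.
        const uint8_t q_in = 42;
        const long    q    = std::lround(static_cast<float>(q_in) / req.scale + req.offset);
        return static_cast<uint8_t>(std::clamp<long>(q, 0L, 255L));
    }

Compared with the deprecated compute_requantization_scale_offset(), the only difference is that the offset is never rounded to int32_t before being applied, which removes one source of off-by-one results.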
@@ -628,9 +679,32 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform // In order to minimize flooring we convert the offset to a float, // then compute the new offset in the float domain, // finally we convert it back as int32_t - offset_to_apply -= static_cast(static_cast(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + offset_to_apply -= + arm_compute::round(static_cast(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale, rounding_policy); return UniformQuantizationInfo(scale_to_apply, offset_to_apply); } +/** Similar to @ref arm_compute::compute_requantization_scale_offset() + * but returning offset as float instead of integer + */ +inline UniformRequantizationInfo compute_requantization_scale_float_offset(const UniformQuantizationInfo &uqinfo_in, + const UniformQuantizationInfo &uqinfo_out) +{ + float scale_to_apply = uqinfo_out.scale; + float offset_to_apply = static_cast(uqinfo_out.offset); + + scale_to_apply /= uqinfo_in.scale; + offset_to_apply -= static_cast(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale; + + return UniformRequantizationInfo(scale_to_apply, offset_to_apply); +} + } // namespace arm_compute #endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index 6c93ff0c39..e4c9cbe879 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -327,6 +327,9 @@ class TensorInfo final : public ITensorInfo private: /** Calculates strides, offset and total size resulting from the specified padding around the XY plane. + * + * @note When interpreting the required_strides in the return value, only the values up to the corresponding dimension in the tensor is + * valid. For example, 1D tensor should only refer to 1D in required_strides, 2D tensor up to 2D in required_strides, and so on. * * @param[in] padding Padding around the XY plane in elements. 
*/ diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h index b19a3dd1e7..86adb761ac 100644 --- a/arm_compute/core/utils/DataTypeUtils.h +++ b/arm_compute/core/utils/DataTypeUtils.h @@ -69,6 +69,35 @@ inline size_t data_size_from_type(DataType data_type) } } +/** Get underlying data type + * + * @param[in] data_type Input data type + * + * @return the underlying data type + */ +inline constexpr DataType get_underlying_data_type(DataType data_type) +{ + switch (data_type) + { + case DataType::U8: + case DataType::QASYMM8: + return DataType::U8; + case DataType::S8: + case DataType::QSYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return DataType::S8; + case DataType::U16: + case DataType::QASYMM16: + return DataType::U16; + case DataType::S16: + case DataType::QSYMM16: + return DataType::S16; + default: + return data_type; + } +} + /** The size in bytes of the data type * * @param[in] dt Input data type diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h index 83b12d572e..575c1498ac 100644 --- a/arm_compute/function_info/ActivationLayerInfo.h +++ b/arm_compute/function_info/ActivationLayerInfo.h @@ -65,6 +65,7 @@ class ActivationLayerInfo /** Lookup table */ #ifdef __aarch64__ + // TODO (COMPMID-7511): delegate to LUTManager using LookupTable256 = std::array; using LookupTable65536 = std::array; #endif // __aarch64__ diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h index 0d8f53fe02..57c6408ef4 100644 --- a/arm_compute/runtime/CL/functions/CLCast.h +++ b/arm_compute/runtime/CL/functions/CLCast.h @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ + #ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCAST_H #define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCAST_H @@ -35,7 +36,7 @@ class CLCompileContext; class ICLTensor; class ITensorInfo; -/** Basic function to run opencl::kernels::ClCastKernel */ +/** Basic function to run type cast operation */ class CLCast : public IFunction { public: @@ -52,28 +53,36 @@ class CLCast : public IFunction /** Default move assignment operator */ CLCast &operator=(CLCast &&); /** Initialize the function's source, destination + * + * @note When casting from/to quantized types the scale and zeroPoint are ignored * * Valid data layouts: * - All * * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------------------------------| - * |U8 | S8, U16, S16, U32, S32, F16, F32 | - * |S8 | U8, U16, S16, U32, S32, F16, F32 | - * |U16 | U8, S8, S16, U32, S32, F16, F32 | - * |S16 | U8, S8, U16, U32, S32, F16, F32 | - * |U32 | U8, S8, U16, S16, S32, F16, F32 | - * |S32 | U8, S8, U16, S16, U32, F16, F32 | - * |U64 | U8, S8, U16, S16, U32, S32, F16, F32 | - * |S64 | U8, S8, U16, S16, U32, S32, F16, F32 | - * |F16 | U8, S8, U16, S16, S32, U32, F32 | - * |F32 | U8, S8, U16, S16, S32, U32, F16 | + * |src |dst | + * |:------------------|:-----------------------------------------------------------------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized | + * |S8 | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized | + * |U16 | U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16 | + * |S16 | U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16 | + * |U32 | U8, S8, U16, S16, S32, F16, F32, All Quantized | + * |S32 | U8, S8, U16, S16, U32, F16, F32, All Quantized | + * |U64 | U8, S8, U16, S16, U32, S32, F16, F32, All Quantized | + * |S64 | U8, S8, U16, S16, U32, S32, F16, F32, All Quantized | + * |F16 | U8, S8, U16, S16, S32, U32, F32, All Quantized | + * |F32 | U8, S8, U16, S16, S32, U32, F16, All Quantized | + * |QASYMM8 | S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized | + * |QASYMM8_SIGNED | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized | + * |QSYMM8 | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized | + * |QSYMM8_PER_CHANNEL | U8, U16, S16, U32, S32, F16, F32, 16-bit Quantized | + * |QASYMM16 | U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16 | + * |QSYMM16 | U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16 | * * Input data type must be different than output data type. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32. - * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] input The input tensor to convert. + * @param[out] output The output tensor. * @param[in] policy Conversion policy. */ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); @@ -82,14 +91,11 @@ class CLCast : public IFunction configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration of @ref CLCast * - * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32. - * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[in] policy Conversion policy. 
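With the extended table above, CLCast also accepts quantized source and destination types; as the note says, the scale and zero point attached to the tensors are ignored, so only the stored integer values are converted. A minimal usage sketch for a QASYMM8 to F32 cast; the shape and quantization parameters are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    void cast_qasymm8_to_f32()
    {
        CLScheduler::get().default_init(); // pick a default OpenCL device and queue

        CLTensor src, dst;
        // The QuantizationInfo is carried by the tensor info but ignored by the cast.
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 5)));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        CLCast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, then:
        cast.run();
    }

As documented below, validate() takes the same arguments as configure(), so the same tensor infos can be checked up front.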
+ * Similar to @ref CLCast::configure() * * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy); - // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h index c3ef2932f1..5025b5eaf4 100644 --- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h +++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h @@ -29,6 +29,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/Tensor.h" namespace arm_compute @@ -40,7 +41,10 @@ class CPPBoxWithNonMaximaSuppressionLimit : public IFunction { public: /** Constructor */ - CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr memory_manager = nullptr); + CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr memory_manager); + CPPBoxWithNonMaximaSuppressionLimit() : CPPBoxWithNonMaximaSuppressionLimit(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ CPPBoxWithNonMaximaSuppressionLimit(const CPPBoxWithNonMaximaSuppressionLimit &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h index 93ea3d2c72..35da650857 100644 --- a/arm_compute/runtime/MemoryGroup.h +++ b/arm_compute/runtime/MemoryGroup.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_MEMORYGROUP_H -#define ARM_COMPUTE_MEMORYGROUP_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H +#define ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/Macros.h" +#include "arm_compute/runtime/Allocator.h" #include "arm_compute/runtime/IMemoryGroup.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IMemoryPool.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include #include @@ -66,10 +68,11 @@ class MemoryGroup final : public IMemoryGroup std::shared_ptr _memory_manager; /**< Memory manager to be used by the group */ IMemoryPool *_pool; /**< Memory pool that the group is scheduled with */ MemoryMappings _mappings; /**< Memory mappings of the group */ + bool _auto_clear; /**< Whether the memory manager will be auto-cleared on release */ }; inline MemoryGroup::MemoryGroup(std::shared_ptr memory_manager) noexcept - : _memory_manager(memory_manager), _pool(nullptr), _mappings() + : _memory_manager(memory_manager), _pool(nullptr), _mappings(), _auto_clear(false) { } @@ -104,6 +107,17 @@ inline void MemoryGroup::acquire() if (!_mappings.empty()) { ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager()); + // If the caller has not populated the underlying memory manager, + // do it here. Also set flag to auto-clear the memory manager on release. + // This is needed when using default memory managers that were not set up + // by the user. 
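This MemoryGroup change, together with the MemoryManagerOnDemand::make_default() used by the new default constructors throughout this patch, is what lets a function built without an explicit memory manager still get a working pool: if the manager arrives empty it is populated here and cleared again on release, as the code that follows this comment shows. A sketch of the two construction styles as seen from user code (configuration and tensors omitted, values illustrative):

    #include "arm_compute/runtime/Allocator.h"
    #include "arm_compute/runtime/MemoryManagerOnDemand.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"

    using namespace arm_compute;

    void construct_gemm_functions()
    {
        // 1) Default construction: the function creates its own manager via
        //    MemoryManagerOnDemand::make_default(); MemoryGroup populates the pool
        //    on first acquire() and clears it again on release().
        NEGEMM gemm_default;

        // 2) Explicit manager: share one MemoryManagerOnDemand between functions
        //    and populate it once after everything is configured.
        auto mm = MemoryManagerOnDemand::make_default();
        NEGEMM gemm_a(mm);
        NEGEMM gemm_b(mm);
        // ... configure gemm_a and gemm_b ...
        Allocator alloc;
        mm->populate(alloc, 1 /* num_pools */);
        // ... run(), then mm->clear() when the functions are no longer needed ...
    }

Code that previously relied on the old `= nullptr` default keeps compiling unchanged, since the new default constructors delegate to make_default().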
+ if (_memory_manager->pool_manager()->num_pools() == 0) + { + Allocator alloc{}; + _memory_manager->populate(alloc, 1); + _auto_clear = true; + } + _pool = _memory_manager->pool_manager()->lock_pool(); _pool->acquire(_mappings); } @@ -118,6 +132,12 @@ inline void MemoryGroup::release() _pool->release(_mappings); _memory_manager->pool_manager()->unlock_pool(_pool); _pool = nullptr; + + if (_auto_clear) + { + _memory_manager->clear(); + _auto_clear = false; + } } } @@ -126,4 +146,4 @@ inline MemoryMappings &MemoryGroup::mappings() return _mappings; } } // namespace arm_compute -#endif /*ARM_COMPUTE_MEMORYGROUP_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h index 7c31fe7f5a..0192f0b641 100644 --- a/arm_compute/runtime/MemoryManagerOnDemand.h +++ b/arm_compute/runtime/MemoryManagerOnDemand.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H -#define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H +#define ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H #include "arm_compute/runtime/ILifetimeManager.h" #include "arm_compute/runtime/IMemoryGroup.h" @@ -49,6 +49,8 @@ class MemoryManagerOnDemand : public IMemoryManager /** Allow instances of this class to be moved */ MemoryManagerOnDemand &operator=(MemoryManagerOnDemand &&) = default; + static std::shared_ptr make_default(); + // Inherited methods overridden: ILifetimeManager *lifetime_manager() override; IPoolManager *pool_manager() override; @@ -60,4 +62,4 @@ class MemoryManagerOnDemand : public IMemoryManager std::shared_ptr _pool_mgr; /**< Memory pool manager */ }; } // namespace arm_compute -#endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h index f8a4898281..4922edc2e1 100644 --- a/arm_compute/runtime/MemoryRegion.h +++ b/arm_compute/runtime/MemoryRegion.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H -#define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H +#define ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IMemoryRegion.h" #include +#include namespace arm_compute { @@ -100,4 +101,4 @@ class MemoryRegion final : public IMemoryRegion void *_ptr; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_RUNTIME_MEMORY_REGION_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index d58f7dda3e..88cedc6724 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" namespace arm_compute @@ -48,7 +49,10 @@ class NEArgMinMaxLayer : public IFunction { public: /** Constructor */ - NEArgMinMaxLayer(std::shared_ptr memory_manager = nullptr); + NEArgMinMaxLayer(std::shared_ptr memory_manager); + NEArgMinMaxLayer() : NEArgMinMaxLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEArgMinMaxLayer(const NEArgMinMaxLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index a6c0cfa7fa..bd049df4af 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -29,6 +29,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -73,7 +74,10 @@ class NEConvolutionLayer : public IFunction { public: /** Constructor */ - NEConvolutionLayer(std::shared_ptr memory_manager = nullptr); + NEConvolutionLayer(std::shared_ptr memory_manager); + NEConvolutionLayer() : NEConvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEConvolutionLayer(const NEConvolutionLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index aabe42f928..0c6bb1d2bf 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H -#define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEReverse.h" @@ -74,7 +75,10 @@ class NEDeconvolutionLayer : public IFunction { public: /** Constructor */ - NEDeconvolutionLayer(std::shared_ptr memory_manager = nullptr); + NEDeconvolutionLayer(std::shared_ptr memory_manager); + NEDeconvolutionLayer() : NEDeconvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete; /** Default move constructor */ @@ -166,4 +170,4 @@ class NEDeconvolutionLayer : public IFunction bool _do_upsampling; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 1b0eb8fc3e..76250a6aee 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" @@ -43,7 +44,10 @@ class NEDepthwiseConvolutionLayer : public IFunction { public: /** Default constructor */ - NEDepthwiseConvolutionLayer(std::shared_ptr memory_manager = nullptr); + NEDepthwiseConvolutionLayer(std::shared_ptr memory_manager); + NEDepthwiseConvolutionLayer() : NEDepthwiseConvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDepthwiseConvolutionLayer(const NEDepthwiseConvolutionLayer &) = delete; /** Default move constructor */ diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index 7a94833d10..b789cc0579 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H -#define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" #include "arm_compute/runtime/Tensor.h" @@ -47,7 +48,10 @@ class NEDetectionPostProcessLayer : public IFunction { public: /** Constructor */ - NEDetectionPostProcessLayer(std::shared_ptr memory_manager = nullptr); + NEDetectionPostProcessLayer(std::shared_ptr memory_manager); + NEDetectionPostProcessLayer() : NEDetectionPostProcessLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDetectionPostProcessLayer(const NEDetectionPostProcessLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -119,4 +123,4 @@ class NEDetectionPostProcessLayer : public IFunction bool _run_dequantize; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h index a8d930d9ba..ce554d494e 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT1D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/Tensor.h" #include @@ -49,7 +50,10 @@ class NEFFT1D : public IFunction { public: /** Default Constructor */ - NEFFT1D(std::shared_ptr memory_manager = nullptr); + NEFFT1D(std::shared_ptr memory_manager); + NEFFT1D() : NEFFT1D(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFFT1D(const NEFFT1D &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h index 84bfe6b02f..2b98562fe8 100644 --- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H -#define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEFFT2D.h" @@ -59,7 +60,10 @@ class NEFFTConvolutionLayer : public IFunction { public: /** Default constructor */ - NEFFTConvolutionLayer(std::shared_ptr memory_manager = nullptr); + NEFFTConvolutionLayer(std::shared_ptr memory_manager); + NEFFTConvolutionLayer() : NEFFTConvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFFTConvolutionLayer(const NEFFTConvolutionLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ @@ -172,4 +176,4 @@ class NEFFTConvolutionLayer : public IFunction bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index a0c03af351..78d27c7376 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/Tensor.h" @@ -87,8 +88,10 @@ class NEFullyConnectedLayer : public IFunction { public: /** Constructor */ - NEFullyConnectedLayer(std::shared_ptr memory_manager = nullptr, - IWeightsManager *weights_manager = nullptr); + NEFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager = nullptr); + NEFullyConnectedLayer() : NEFullyConnectedLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 70493edd69..1f5e51e598 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -41,7 +42,10 @@ class NEGEMM : public IFunction { public: /** Constructor */ - NEGEMM(std::shared_ptr memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMM(std::shared_ptr memory_manager, IWeightsManager *weights_manager = nullptr); + NEGEMM() : NEGEMM(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMM(const NEGEMM &) = 
delete; /** Default move constructor */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index a4e2fce7c1..54def64d18 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -50,7 +51,10 @@ class NEGEMMConv2d : public IFunction { public: /** Constructor */ - NEGEMMConv2d(const std::shared_ptr &memory_manager = nullptr); + NEGEMMConv2d(const std::shared_ptr &memory_manager); + NEGEMMConv2d() : NEGEMMConv2d(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMConv2d(const NEGEMMConv2d &) = delete; /** Default move constructor */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 7baa940f82..d37d49b48c 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -30,6 +30,7 @@ #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -47,8 +48,11 @@ class NEGEMMConvolutionLayer : public IFunction { public: /** Constructor */ - NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager = nullptr, + NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager = nullptr); + NEGEMMConvolutionLayer() : NEGEMMConvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index c2d3089027..81c6115791 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -29,6 +29,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -47,8 +48,11 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction { public: /** Constructor */ - NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager = nullptr, + NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager = nullptr); + NEGEMMLowpMatrixMultiplyCore() : NEGEMMLowpMatrixMultiplyCore(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete; /** Default move constructor */ diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index 0032d0c26d..4ee3cf9f3a 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ 
b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -29,6 +29,7 @@ #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" @@ -62,7 +63,10 @@ class NEGenerateProposalsLayer : public IFunction * * @param[in] memory_manager (Optional) Memory manager. */ - NEGenerateProposalsLayer(std::shared_ptr memory_manager = nullptr); + NEGenerateProposalsLayer(std::shared_ptr memory_manager); + NEGenerateProposalsLayer() : NEGenerateProposalsLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGenerateProposalsLayer(const NEGenerateProposalsLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index b7ff9965db..a719f375fb 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" @@ -47,7 +48,10 @@ class NEInstanceNormalizationLayer : public IFunction { public: /** Constructor */ - NEInstanceNormalizationLayer(std::shared_ptr memory_manager = nullptr); + NEInstanceNormalizationLayer(std::shared_ptr memory_manager); + NEInstanceNormalizationLayer() : NEInstanceNormalizationLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEInstanceNormalizationLayer(const NEInstanceNormalizationLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index c29738d8e7..0e164ee43f 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -27,6 +27,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" @@ -47,7 +48,10 @@ class NEL2NormalizeLayer : public IFunction { public: /** Constructor */ - NEL2NormalizeLayer(std::shared_ptr memory_manager = nullptr); + NEL2NormalizeLayer(std::shared_ptr memory_manager); + NEL2NormalizeLayer() : NEL2NormalizeLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEL2NormalizeLayer(const NEL2NormalizeLayer &) = delete; /** Prevent instances of this class from being copied (As this 
class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 8416111881..2c723840af 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" @@ -47,7 +48,10 @@ class NELSTMLayer : public IFunction { public: /** Default constructor */ - NELSTMLayer(std::shared_ptr memory_manager = nullptr); + NELSTMLayer(std::shared_ptr memory_manager); + NELSTMLayer() : NELSTMLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NELSTMLayer(const NELSTMLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index ae951669b3..72efde36ee 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NELSTMLAYERQUANTIZED_H -#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/common/LSTMParams.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -63,7 +64,10 @@ class NELSTMLayerQuantized : public IFunction { public: /** Default constructor */ - NELSTMLayerQuantized(std::shared_ptr memory_manager = nullptr); + NELSTMLayerQuantized(std::shared_ptr memory_manager); + NELSTMLayerQuantized() : NELSTMLayerQuantized(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NELSTMLayerQuantized(const NELSTMLayerQuantized &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ @@ -233,4 +237,4 @@ class NELSTMLayerQuantized : public IFunction bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NELSTMLAYERQUANTIZED_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h index ae0e317d2e..6b80917716 100644 --- a/arm_compute/runtime/NEON/functions/NEMatMul.h +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ 
-78,7 +79,10 @@ class NEMatMul : public IFunction { public: /** Constructor */ - NEMatMul(); + NEMatMul(std::shared_ptr memory_manager); + NEMatMul() : NEMatMul(MemoryManagerOnDemand::make_default()) + { + } /** Destructor */ ~NEMatMul(); /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index ce615c62e0..0f09f80c62 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/Tensor.h" @@ -49,7 +50,10 @@ class NENormalizationLayer : public IFunction { public: /** Default constructor */ - NENormalizationLayer(std::shared_ptr memory_manager = nullptr); + NENormalizationLayer(std::shared_ptr memory_manager); + NENormalizationLayer() : NENormalizationLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NENormalizationLayer(const NENormalizationLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h index ce8aeca790..eabcd32536 100644 --- a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -43,7 +44,10 @@ class NEPooling3dLayer : public IFunction { public: /** Constructor */ - NEPooling3dLayer(std::shared_ptr memory_manager = nullptr); + NEPooling3dLayer(std::shared_ptr memory_manager); + NEPooling3dLayer() : NEPooling3dLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPooling3dLayer(const NEPooling3dLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 51f7d982f0..4aa1b205fc 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -45,7 +46,10 @@ class NEPoolingLayer : public IFunction { public: /** Constructor */ - NEPoolingLayer(std::shared_ptr memory_manager = nullptr); + NEPoolingLayer(std::shared_ptr memory_manager); + NEPoolingLayer() : NEPoolingLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPoolingLayer(const NEPoolingLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h 
b/arm_compute/runtime/NEON/functions/NERNNLayer.h index af7f464ac9..8ba2a4911d 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NERNNLAYER_H -#define ARM_COMPUTE_NERNNLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" @@ -41,7 +42,10 @@ class NERNNLayer : public IFunction { public: /** Default constructor */ - NERNNLayer(std::shared_ptr memory_manager = nullptr); + NERNNLayer(std::shared_ptr memory_manager); + NERNNLayer() : NERNNLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NERNNLayer(const NERNNLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ @@ -116,4 +120,4 @@ class NERNNLayer : public IFunction bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NERNNLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index 5b8d8cdf2b..eb02099b86 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H -#define ARM_COMPUTE_NEON_REDUCE_MEAN_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -38,7 +39,10 @@ class NEReduceMean : public IFunction { public: /** Constructor */ - NEReduceMean(std::shared_ptr memory_manager = nullptr); + NEReduceMean(std::shared_ptr memory_manager); + NEReduceMean() : NEReduceMean(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEReduceMean(const NEReduceMean &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -95,4 +99,4 @@ class NEReduceMean : public IFunction bool _keep_dims; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index 8b56e17f65..c46d0e84e0 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -25,6 +25,7 @@ #define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCTIONOPERATION_H #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -45,7 +46,10 @@ class NEReductionOperation : public IFunction { public: /** Default constructor */ - NEReductionOperation(std::shared_ptr memory_manager = nullptr); + NEReductionOperation(std::shared_ptr memory_manager); + NEReductionOperation() : NEReductionOperation(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEReductionOperation(const NEReductionOperation &) = delete; /** Default move constructor */ diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index 1787de6237..276f5dc287 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
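For callers, the effect of the constructor change applied across the NE* runtime functions in this patch (the defaulted memory_manager = nullptr argument is replaced by an explicit constructor plus a delegating default constructor) is sketched below. This is a minimal, hedged illustration; the specific functions and the shared-manager pooling are only examples, and tensor setup, configure() and run() are omitted.

    #include "arm_compute/runtime/MemoryManagerOnDemand.h"
    #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

    using namespace arm_compute;

    void construct_functions()
    {
        // Default construction now delegates to MemoryManagerOnDemand::make_default()
        // instead of passing a nullptr memory manager.
        NESoftmaxLayer softmax;

        // A shared manager can still be injected explicitly, e.g. to pool the
        // internal allocations of several functions.
        auto mm = MemoryManagerOnDemand::make_default();
        NESoftmaxLayer softmax_shared(mm);
        NEPoolingLayer pool_shared(mm);
    }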
*/ -#ifndef ARM_COMPUTE_NESOFTMAXLAYER_H -#define ARM_COMPUTE_NESOFTMAXLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include @@ -41,7 +42,10 @@ class NESoftmaxLayerGeneric : public IFunction { public: /** Constructor */ - NESoftmaxLayerGeneric(std::shared_ptr memory_manager = nullptr); + NESoftmaxLayerGeneric(std::shared_ptr memory_manager); + NESoftmaxLayerGeneric() : NESoftmaxLayerGeneric(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NESoftmaxLayerGeneric(const NESoftmaxLayerGeneric &) = delete; /** Default move constructor */ @@ -98,4 +102,4 @@ using NESoftmaxLayer = NESoftmaxLayerGeneric; using NELogSoftmaxLayer = NESoftmaxLayerGeneric; } // namespace arm_compute -#endif /* ARM_COMPUTE_NESOFTMAXLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index 7b00fd3b9d..dce1e3e764 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/Tensor.h" #include @@ -49,7 +50,10 @@ class NEWinogradConvolutionLayer : public IFunction { public: /** Constructor */ - NEWinogradConvolutionLayer(const std::shared_ptr &memory_manager = nullptr); + NEWinogradConvolutionLayer(const std::shared_ptr &memory_manager); + NEWinogradConvolutionLayer() : NEWinogradConvolutionLayer(MemoryManagerOnDemand::make_default()) + { + } /** Prevent instances of this class from being copied (As this class contains pointers) */ NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete; /** Default move constructor */ diff --git a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h index bc0e36cb0a..5958382f6c 100644 --- a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h +++ b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h @@ -111,6 +111,13 @@ class CpuGemmAssemblyDispatch : arm_compute::experimental::IOperator const GEMMInfo &gemm_info = GEMMInfo()); /** Indicates whether or not this function can be used to process the given parameters. + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |nullptr |F32 | + * |F16 |F16 |nullptr |F16 | + * |BFLOAT16 |BFLOAT16 |nullptr |BFLOAT16 | + * |BFLOAT16 |BFLOAT16 |nullptr |BFLOAT32 | * * @param[in] a Input tensor info (Matrix A) * @param[in] b Input tensor info (Matrix B) diff --git a/arm_compute/runtime/experimental/operators/CpuDequantize.h b/arm_compute/runtime/experimental/operators/CpuDequantize.h new file mode 100644 index 0000000000..90b3ebd107 --- /dev/null +++ b/arm_compute/runtime/experimental/operators/CpuDequantize.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. 
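The table added to CpuGemmAssemblyDispatch above documents which data-type combinations the low-level GEMM dispatch accepts. Below is a hedged sketch of how a caller might query support before configuring; the experimental::op::ll namespace and the exact validate() parameter list are assumptions based on the surrounding header rather than shown verbatim in this hunk.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/function_info/GEMMInfo.h"
    #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h"

    using namespace arm_compute;

    bool can_run_f32_gemm(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_d)
    {
        // F32 x F32 -> F32 is one of the documented configurations; src2 (bias) is left as nullptr.
        const TensorInfo a(shape_a, 1, DataType::F32);
        const TensorInfo b(shape_b, 1, DataType::F32);
        const TensorInfo d(shape_d, 1, DataType::F32);

        const Status st =
            experimental::op::ll::CpuGemmAssemblyDispatch::validate(&a, &b, nullptr, &d, GEMMInfo());
        return st.error_code() == ErrorCode::OK;
    }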
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H +#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H + +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/runtime/NEON/INEOperator.h" + +#include + +/* + * A shallow wrapper for arm_compute::cpu::CpuDequantize. + * Any new features should be added to arm_compute::cpu::CpuDequantize and + * arm_compute::experimental::op::CpuDequantize should remain a shallow wrapper. +*/ +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +/** A simple wrapper class which runs cpu::CpuDequantize that dequantizes an input tensor */ +class CpuDequantize : public INEOperator +{ +public: + /** Default Constructor */ + CpuDequantize(); + /** Default Destructor */ + ~CpuDequantize(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDequantize(const CpuDequantize &) = delete; + /** Default move constructor */ + CpuDequantize(CpuDequantize &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDequantize &operator=(const CpuDequantize &) = delete; + /** Default move assignment operator */ + CpuDequantize &operator=(CpuDequantize &&) = default; + /** Configure the kernel. + * + * Valid configurations and data layouts can be referenced in @ref arm_compute::NEDequantizationLayer. + */ + void configure(const ITensorInfo *input, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref CpuDequantize + * + * Similar to @ref CpuDequantize::configure + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + struct Impl; + std::unique_ptr impl_; +}; +} // namespace op +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H diff --git a/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h b/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h new file mode 100644 index 0000000000..0ca7113d8f --- /dev/null +++ b/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
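The CpuDequantize wrapper declared above follows the usual two-step flow of the experimental operators: configure() on tensor info, then run() on an ITensorPack. A minimal sketch follows; tensor creation and allocation are assumed to happen elsewhere, and the ACL_SRC/ACL_DST slot ids are assumed to follow the convention used by the other experimental operators.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/experimental/operators/CpuDequantize.h"

    using namespace arm_compute;

    void dequantize(Tensor &q_src /* e.g. QASYMM8 */, Tensor &f_dst /* e.g. F32 */)
    {
        experimental::op::CpuDequantize dq;
        dq.configure(q_src.info(), f_dst.info()); // valid shapes/types as for NEDequantizationLayer

        ITensorPack pack{{ACL_SRC, &q_src}, {ACL_DST, &f_dst}};
        dq.run(pack);
    }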
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H +#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H + +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/function_info/GEMMInfo.h" +#include "arm_compute/runtime/NEON/INEOperator.h" + +#include + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; + +namespace experimental +{ +namespace op +{ +/* + * A shallow wrapper for arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore. + * Any new features should be added to arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore and + * arm_compute::experimental::op::CpuGEMMLowp should remain a shallow wrapper. +*/ +class CpuGEMMLowp : public INEOperator +{ +public: + /** Constructor */ + CpuGEMMLowp(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGEMMLowp(const CpuGEMMLowp &) = delete; + /** Default move constructor */ + CpuGEMMLowp(CpuGEMMLowp &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGEMMLowp &operator=(const CpuGEMMLowp &) = delete; + /** Default move assignment operator */ + CpuGEMMLowp &operator=(CpuGEMMLowp &&) = default; + /** Default destructor */ + ~CpuGEMMLowp(); + /** Initialise the kernel's inputs, output + * + *valid configurations can be referenced in @ref arm_compute::NEGEMMLowpMatrixMultiplyCore. 
+ */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGEMMLowp + * + * Similar to @ref CpuGEMMLowp::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + experimental::MemoryRequirements workspace() const override; + +private: + struct Impl; + std::unique_ptr _impl; +}; +} // namespace op +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H diff --git a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h index 2bbc7148d5..6cb539a6dc 100644 --- a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h +++ b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h @@ -137,6 +137,13 @@ class CpuGemmConv2d : public IOperator const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + /** Update of quantization information at the run stage for convolution so that the quantization multipliers can be properly calculated. + * Please @ref NEGEMMConvolutionLayer for a more in-depth explanation and example. + * + * @param[in] tensors Vector that contains the tensors to operate on. + */ + void update_quantization_parameters(ITensorPack &tensors); + void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; diff --git a/arm_compute/runtime/experimental/operators/CpuQuantize.h b/arm_compute/runtime/experimental/operators/CpuQuantize.h new file mode 100644 index 0000000000..962204ca36 --- /dev/null +++ b/arm_compute/runtime/experimental/operators/CpuQuantize.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
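Like the other experimental wrappers, the CpuGEMMLowp class declared above is configured on ITensorInfo and receives its tensors through an ITensorPack at run time. The sketch below is hedged: it ignores the auxiliary workspace that workspace() may report (a real caller would allocate those tensors and append them to the pack), and the ACL_SRC_0/ACL_SRC_1/ACL_DST slot ids are assumed from the convention used elsewhere in the library.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h"

    using namespace arm_compute;

    Status run_gemmlowp(Tensor &a /* QASYMM8 */, Tensor &b /* QASYMM8 */, Tensor &dst /* S32 */)
    {
        using experimental::op::CpuGEMMLowp;

        const Status st = CpuGEMMLowp::validate(a.info(), b.info(), nullptr, dst.info());
        if (st.error_code() != ErrorCode::OK)
        {
            return st;
        }

        CpuGEMMLowp gemm;
        gemm.configure(a.info(), b.info(), nullptr, dst.info());

        ITensorPack pack{{ACL_SRC_0, &a}, {ACL_SRC_1, &b}, {ACL_DST, &dst}};
        gemm.prepare(pack); // reshapes/packs the constant operand once
        gemm.run(pack);
        return st;
    }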
+ */ +#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H +#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/NEON/INEOperator.h" + +#include "src/cpu/ICpuOperator.h" + +#include + +/* + * A shallow wrapper for arm_compute::cpu::CpuQuantize. + * Any new features should be added to arm_compute::cpu::CpuQuantize and + * arm_compute::experimental::op::CpuQuantize should remain a shallow wrapper. +*/ +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ + +/** A simple wrapper class which runs cpu::CpuQuantize */ +class CpuQuantize : public arm_compute::experimental::INEOperator +{ +public: + CpuQuantize(); + /** Default Destructor */ + ~CpuQuantize(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuQuantize(const CpuQuantize &) = delete; + /** Default move constructor */ + CpuQuantize(CpuQuantize &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuQuantize &operator=(const CpuQuantize &) = delete; + /** Default move assignment operator */ + CpuQuantize &operator=(CpuQuantize &&) = default; + /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------------------|:--------------------------------------| + * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * + * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. + * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 + */ + void configure(const ITensorInfo *input, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantize + * + * Similar to @ref CpuQuantize::configure() + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + struct Impl; + std::unique_ptr impl_; +}; +} // namespace op +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H diff --git a/docs/Doxyfile b/docs/Doxyfile index d92a65f340..0621168e94 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -60,7 +60,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 24.09 +PROJECT_NUMBER = 24.11 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox index 1f97ccc458..371c2e1133 100644 --- a/docs/user_guide/library.dox +++ b/docs/user_guide/library.dox @@ -606,10 +606,5 @@ Supported data-types options are: The list of supported operators can be found in filelist.json in the root of Compute Library repo. 
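The data-type table in the CpuQuantize header above is what validate() checks; note that the target scale and offset are carried by the destination tensor's QuantizationInfo rather than by a separate parameter. A minimal sketch of an F32 -> QASYMM8 configuration check (the scale and offset values are illustrative):

    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/experimental/operators/CpuQuantize.h"

    using namespace arm_compute;

    Status check_f32_to_qasymm8(const TensorShape &shape)
    {
        const TensorInfo src(shape, 1, DataType::F32);
        TensorInfo       dst(shape, 1, DataType::QASYMM8);
        dst.set_quantization_info(QuantizationInfo(0.05f, 10)); // scale and offset of the quantized output

        return experimental::op::CpuQuantize::validate(&src, &dst);
    }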
-@subsection architecture_experimental_build_high_priority_operators Build high priority operators - -Selecting high_priority when building Compute Library, one new library will be created: libarm_compute_hp and -will contain a selected subset of the libary operators. Currently the operators are staticly set. - */ } // namespace arm_compute diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index f423260fb5..47d872e0f1 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -455,16 +455,22 @@ where N = batches, C = channels, H = height, W = width, D = depth
     <tr><th>src<th>dst
-     <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32
-     <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32
-     <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32
-     <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32
-     <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32
-     <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32
-     <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32
-     <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32
-     <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32
-     <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16
+     <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized
+     <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+     <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16
+     <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16
+     <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32, All Quantized
+     <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32, All Quantized
+     <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32, All Quantized
+     <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32, All Quantized
+     <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32, All Quantized
+     <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16, All Quantized
+     <tr><td>QASYMM8<td>S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized
+     <tr><td>QASYMM8_SIGNED<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+     <tr><td>QSYMM8<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+     <tr><td>QSYMM8_PER_CHANNEL<td>U8, U16, S16, U32, S32, F16, F32, 16-bit Quantized
+     <tr><td>QASYMM16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16
+     <tr><td>QSYMM16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16
ChannelShuffleLayer diff --git a/filelist.json b/filelist.json index 5b49a68692..38cdff601d 100644 --- a/filelist.json +++ b/filelist.json @@ -109,15 +109,6 @@ "src/c/operators/AclActivation.cpp" ] }, - "high_priority": [ - "Activation", - "DepthwiseConv2d", - "Conv2d", - "Permute", - "Pool2d", - "Reshape", - "MatMul" - ], "gpu": { "common": [ "src/core/CL/CLCompileContext.cpp", @@ -1612,11 +1603,14 @@ "src/runtime/experimental/operators/CpuActivation.cpp", "src/runtime/experimental/operators/CpuAdd.cpp", "src/runtime/experimental/operators/CpuDepthwiseConv2d.cpp", + "src/runtime/experimental/operators/CpuDequantize.cpp", "src/runtime/experimental/operators/CpuElementwise.cpp", + "src/runtime/experimental/operators/CpuGEMMLowp.cpp", "src/runtime/experimental/operators/CpuGemm.cpp", "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuQuantize.cpp", "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", @@ -2312,7 +2306,7 @@ "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"] }, "sve": { - "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ] + "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp", "src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp" ] }, "sve2":{ "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"], diff --git a/scripts/format_code.py b/scripts/format_code.py index 8bfb3f5601..6da63fff81 100755 --- a/scripts/format_code.py +++ b/scripts/format_code.py @@ -142,20 +142,20 @@ def check_license(filename): f.close() f = open(filename, "w") - f.write("".join(content[:2])) + f.write("".join(content[:3])) year = datetime.datetime.now().year # This only works until year 9999 - m = re.match(r"(.*Copyright \(c\) )(.*\d{4})( [Arm|ARM].*)", content[2]) + m = re.match(r"(.*FileCopyrightText: )(.*\d{4})( [arm|Arm|ARM].*)", content[3]) if not m: - f.write("Copyright (c) {} Arm Limited\n".format(year)) + f.write("# SPDX-FileCopyrightText: {} Arm Limited\n#\n".format(year)) else: updated_year = adjust_copyright_year(m.group(2), year) - f.write("Copyright (c) {} Arm Limited\n".format(updated_year)) + f.write("# SPDX-FileCopyrightText: {} Arm Limited\n".format(updated_year)) # Copy the rest of the file's content: - f.write("".join(content[3:])) + f.write("".join(content[4:])) f.close() @@ -276,7 +276,7 @@ def run(self): logger.info("Formatting %s" % f) - check_license("LICENSE") + check_license("LICENSES/MIT.txt") except subprocess.CalledProcessError as e: retval = -1 diff --git a/src/BUILD.bazel b/src/BUILD.bazel index 4aa157efd5..ed869de9aa 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -358,7 +358,8 @@ filegroup( "cpu/kernels/scale/sve/integer.cpp", "cpu/kernels/scale/sve/qasymm8.cpp", "cpu/kernels/scale/sve/qasymm8_signed.cpp", - "cpu/kernels/softmax/generic/sve/impl.cpp"] + + "cpu/kernels/softmax/generic/sve/impl.cpp", + "cpu/kernels/softmax/generic/sve/impl_bf16.cpp"] + glob(["**/*.h", "**/*.hpp", "**/*.inl"]), @@ -1042,11 +1043,14 @@ filegroup( "runtime/experimental/operators/CpuActivation.cpp", "runtime/experimental/operators/CpuAdd.cpp", "runtime/experimental/operators/CpuDepthwiseConv2d.cpp", + "runtime/experimental/operators/CpuDequantize.cpp", "runtime/experimental/operators/CpuElementwise.cpp", + "runtime/experimental/operators/CpuGEMMLowp.cpp", 
"runtime/experimental/operators/CpuGemm.cpp", "runtime/experimental/operators/CpuGemmConv2d.cpp", "runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "runtime/experimental/operators/CpuMul.cpp", + "runtime/experimental/operators/CpuQuantize.cpp", "runtime/experimental/operators/CpuSoftmax.cpp", "runtime/experimental/operators/CpuSub.cpp", "runtime/experimental/operators/CpuTranspose.cpp", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 58eca30847..e8ae6705ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -328,6 +328,7 @@ target_sources( cpu/kernels/scale/sve/qasymm8.cpp cpu/kernels/scale/sve/qasymm8_signed.cpp cpu/kernels/softmax/generic/sve/impl.cpp + cpu/kernels/softmax/generic/sve/impl_bf16.cpp ) target_sources( @@ -1033,11 +1034,14 @@ target_sources( runtime/experimental/operators/CpuActivation.cpp runtime/experimental/operators/CpuAdd.cpp runtime/experimental/operators/CpuDepthwiseConv2d.cpp + runtime/experimental/operators/CpuDequantize.cpp runtime/experimental/operators/CpuElementwise.cpp + runtime/experimental/operators/CpuGEMMLowp.cpp runtime/experimental/operators/CpuGemm.cpp runtime/experimental/operators/CpuGemmConv2d.cpp runtime/experimental/operators/CpuGemmDirectConv2d.cpp runtime/experimental/operators/CpuMul.cpp + runtime/experimental/operators/CpuQuantize.cpp runtime/experimental/operators/CpuSoftmax.cpp runtime/experimental/operators/CpuSub.cpp runtime/experimental/operators/CpuTranspose.cpp diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index 2352e27a17..09e220e75e 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -417,9 +417,12 @@ CpuInfo CpuInfo::build() #elif defined(__aarch64__) && defined(_WIN64) /* #elif defined(__aarch64__) && defined(__APPLE__) */ CpuIsaInfo isainfo; isainfo.neon = true; - if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + isainfo.dot = IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE); + if (NTDDI_VERSION >= NTDDI_WIN11_GE) { - isainfo.dot = true; + isainfo.fp16 = IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE); + isainfo.sve = IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE); + isainfo.i8mm = IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE); } SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); diff --git a/src/core/CL/cl_kernels/common/cast.cl b/src/core/CL/cl_kernels/common/cast.cl index 036a683ec7..e2de6dd8eb 100644 --- a/src/core/CL/cl_kernels/common/cast.cl +++ b/src/core/CL/cl_kernels/common/cast.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -70,19 +70,14 @@ __kernel void cast_down( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); -#if defined(IS_DATA_TYPE_QUANTIZED) - in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80; -#endif // defined(IS_DATA_TYPE_QUANTIZED) +#if defined(QSYMM8_PER_CHANNEL_TO_QASYMM8) + // This operation mode is used in Gemmlowp + in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)) 0x80; +#endif // defined(QSYMM8_PER_CHANNEL_TO_QASYMM8) -#if defined(IS_DATA_TYPE_FLOAT) VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#else /* defined(IS_DATA_TYPE_FLOAT) */ - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); - STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#endif /* defined(IS_DATA_TYPE_FLOAT) */ } /** This function performs a up-casting @@ -122,13 +117,7 @@ __kernel void cast_up( VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); -#if defined(IS_DATA_TYPE_FLOAT) - VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) - res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); - STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#else /* defined(IS_DATA_TYPE_FLOAT) */ VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#endif /* defined(IS_DATA_TYPE_FLOAT) */ } diff --git a/src/core/CL/cl_kernels/common/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl index 69cc288c25..072a9721c4 100644 --- a/src/core/CL/cl_kernels/common/quantization_layer.cl +++ b/src/core/CL/cl_kernels/common/quantization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -81,6 +81,10 @@ __kernel void quantization_layer( // Create scale and offset vectors const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE; const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET; + + // Quantize + VEC_DATA_TYPE(int, VEC_SIZE) + res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL); #else // defined(IS_FLOAT) // Load data VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) @@ -91,18 +95,25 @@ __kernel void quantization_layer( // Create scale and offset vectors const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE; - const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET; -#endif // defined(IS_FLOAT) + const VEC_DATA_TYPE(float, VEC_SIZE) voffset = OFFSET; // Quantize VEC_DATA_TYPE(int, VEC_SIZE) - res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL); + res = CLAMP(CONVERT_RTE_VEC(val_float / vscale + voffset, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL); +#endif // defined(IS_FLOAT) // Store result VSTORE(VEC_SIZE) (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); #else //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + + // Each thread computes a single element +#if defined(IS_FLOAT) *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL); +#else // !defined(IS_FLOAT) + *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE) + (float)OFFSET, int), MIN_QUANT_VAL, MAX_QUANT_VAL); +#endif // defined(IS_FLOAT) + #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL) diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl index bfc0995bb8..60258938d1 100644 --- a/src/core/CL/cl_kernels/common/softmax_layer.cl +++ b/src/core/CL/cl_kernels/common/softmax_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -169,7 +169,7 @@ __kernel void softmax_x( // Normalize the data. #ifdef IS_QUANTIZED # if IS_LOG - TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET; + TMP_DATA_TYPE norm_offset = -log(sum_value) / DST_SCALE + DST_OFFSET; # define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte) # else // IS_LOG TMP_DATA_TYPE norm_div = sum_value * DST_SCALE; @@ -333,7 +333,7 @@ __kernel void softmax_non_x( // Normalize the data. 
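The quantization_layer.cl change above stores the offset as a float and folds it into the value that is rounded on the non-float input path, which matters for requantization: rounding after adding a fractional offset is not the same as rounding first and then adding an integer offset. A scalar sketch of the two formulas follows (function names and clamping bounds are illustrative; the new vquantize_internal overload taking a float offset in NEAsymm.h mirrors the second form on the NEON side).

    #include <algorithm>
    #include <cmath>

    // Old behaviour: round x/scale to nearest, then add an integer offset.
    inline int quantize_round_then_offset(float x, float scale, int offset, int q_min, int q_max)
    {
        return std::clamp(static_cast<int>(std::lrint(x / scale)) + offset, q_min, q_max);
    }

    // New behaviour on the non-float path: add a float offset, then round once.
    inline int quantize_offset_then_round(float x, float scale, float offset, int q_min, int q_max)
    {
        return std::clamp(static_cast<int>(std::lrint(x / scale + offset)), q_min, q_max);
    }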
#ifdef IS_QUANTIZED # if IS_LOG - VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET; + VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) / DST_SCALE + DST_OFFSET; # define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte) # else // IS_LOG VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE; diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index a6d08e5bad..79adb04a09 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -137,7 +137,7 @@ unsigned int CPUInfo::get_L2_cache_size() const return _impl->L2_cache_size; } -uint64_t CPUInfo::get_sme2_vector_length() const +uint64_t CPUInfo::get_sme2_vector_length_in_bytes() const { #ifdef ARM_COMPUTE_ENABLE_SME2 if (this->has_sme2()) @@ -148,6 +148,12 @@ uint64_t CPUInfo::get_sme2_vector_length() const return 0; #endif // ARM_COMPUTE_ENABLE_SME2 } + +uint64_t CPUInfo::get_sme2_vector_length_in_bits() const +{ + return get_sme2_vector_length_in_bytes() * 8; +} + unsigned int CPUInfo::get_cpu_num_excluding_little() const { #if defined(__ANDROID__) diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h index b93e64a0ef..522369309b 100644 --- a/src/core/NEON/NEAsymm.h +++ b/src/core/NEON/NEAsymm.h @@ -651,6 +651,26 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3 return rf; } +inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, float offset) +{ + const float32x4_t voffset = vdupq_n_f32(offset); + const float32x4_t vinvscale = vdupq_n_f32(1.f / scale); + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)), + vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)), +#endif //__aarch64__ + }}; + return rf; +} + /** Quantize a neon vector holding 16 floating point values. * * @param[in] qv Input values to be quantized. @@ -666,6 +686,14 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn return vcombine_u8(pa, pb); } +inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformRequantizationInfo &qi) +{ + auto rf = vquantize_internal(qv, qi.scale, qi.offset); + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_u8(pa, pb); +} + /** Signed quantize a neon vector holding 16 floating point values. * * @param[in] qv Input values to be quantized. @@ -681,6 +709,14 @@ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantiza return vcombine_s8(pa, pb); } +inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformRequantizationInfo &qi) +{ + auto rf = vquantize_internal(qv, qi.scale, qi.offset); + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} + /** Quantize to QASYMM16 a neon vector holding 16 floating point values. 
* * @param[in] qv Input values to be quantized. diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp index 1e4de4a39e..04aa63019f 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp @@ -45,6 +45,7 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } ), +#ifdef ARM_COMPUTE_ENABLE_SVE GemmImplementation::with_estimate( GemmMethod::GEMM_INTERLEAVED, "sve_ffinterleaved_bf16fp32_mmla_8x3VL", @@ -53,6 +54,7 @@ GemmImplementation::with_estimate( [](const GemmArgs &args) { return GemmInterleavedFixedFormat::estimate_cycles(args); }, [](const GemmArgs &args) { return new GemmInterleavedFixedFormat(args); } ), +#endif // ARM_COMPUTE_ENABLE_SVE #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS #endif // ARM_COMPUTE_ENABLE_BF16 #endif // __aarch64__ diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp index d0a8635604..39fd653a6a 100644 --- a/src/core/NEON/kernels/arm_gemm/utils.hpp +++ b/src/core/NEON/kernels/arm_gemm/utils.hpp @@ -172,10 +172,12 @@ namespace utils { // get_vector_length(): Returns SVE vector length for type "T". // // It is required that this can be compiled by a compiler in non-SVE mode, but it must be prevented from running (at -// runtime) if SVE is not enabled. Typically this is used by switchyard/driver code which is built in normal mode +// runtime) if SVE is not enabled. Typically this is used by switchyard/driver code which is built in normal mode // which then calls SVE kernels (compiled accordingly) iff SVE is detected at runtime. template inline unsigned long get_vector_length() { +// x0 register is not available in 32-bit builds +#if defined(__aarch64__) uint64_t vl; __asm __volatile ( @@ -185,10 +187,13 @@ inline unsigned long get_vector_length() { : : "x0" ); - return vl / sizeof(T); +#else // !defined(__aarch64__) + return 16 / sizeof(T); +#endif // defined(__aarch64__) } +#ifdef __aarch64__ namespace sme { template @@ -207,6 +212,7 @@ inline uint64_t get_vector_length() { } } // namespace sme +#endif // __aarch64__ // get_vector_length(VLType): Returns vector length for type "T". // diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h index 381de2284a..4b9e110b87 100644 --- a/src/core/NEON/wrapper/intrinsics/cvt.h +++ b/src/core/NEON/wrapper/intrinsics/cvt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2022-2023 Arm Limited. + * Copyright (c) 2020, 2022-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_WRAPPER_CVT_H -#define ARM_COMPUTE_WRAPPER_CVT_H +#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H +#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H #include @@ -82,6 +82,18 @@ inline typename std::enable_if::value, int32x4_t>::type { return vcvtaq_s32_f32(a); } + +template +inline typename std::enable_if::value, uint32x4_t>::type vcvtn(const float32x4_t &a) +{ + return vcvtnq_u32_f32(a); +} + +template +inline typename std::enable_if::value, int32x4_t>::type vcvtn(const float32x4_t &a) +{ + return vcvtnq_s32_f32(a); +} #endif //__aarch64__ #if defined(ARM_COMPUTE_ENABLE_BF16) @@ -104,4 +116,4 @@ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) } // namespace wrapper } // namespace arm_compute -#endif /* ARM_COMPUTE_WRAPPER_CVT_H */ +#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index cd849c3666..5278a11db4 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -207,8 +207,14 @@ #if defined(ARM_COMPUTE_ENABLE_BF16) #define REGISTER_BF16_NEON(func_name) &(func_name) -#else /* !(defined(ARM_COMPUTE_ENABLE_BF16))*/ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#define REGISTER_BF16_SVE(func_name) &(func_name) +#endif /* !defined(ARM_COMPUTE_ENABLE_SVE)*/ +#else /* !(defined(ARM_COMPUTE_ENABLE_BF16))*/ #define REGISTER_BF16_NEON(func_name) nullptr +#if defined(ARM_COMPUTE_ENABLE_SVE) +#define REGISTER_BF16_SVE(func_name) nullptr +#endif /* !defined(ARM_COMPUTE_ENABLE_SVE)*/ #endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/ #endif // ACL_SRC_CORE_COMMON_REGISTRARS_H diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp index 62ad2bab6d..5c9547af53 100644 --- a/src/core/helpers/LUTManager.cpp +++ b/src/core/helpers/LUTManager.cpp @@ -59,16 +59,35 @@ inline float16_t activation(float16_t x, const LUTInfo &info) return out; } +inline float exponential(float fp, const LUTInfo &info) +{ + return std::exp(fp * info.beta); +} + // Read bf16 value as u16, convert to fp32. // Calculate exp in fp32, return as bf16 -inline uint16_t exponential(uint16_t x, const LUTInfo &info) +inline uint16_t exponential_bf16(uint16_t x, const LUTInfo &info) { float fp = bf16_to_float(x); - fp = std::exp(fp * info.beta * -1); + fp = exponential(fp, info); return float_to_bf16(fp); } -void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info) +void init_lut(LookupTable256 &lut, const LUTInfo &info) +{ + // assert lut is valid config. + ARM_COMPUTE_ASSERT((info.type == LUTType::Exponential && info.dt == DataType::QASYMM8) || + (info.type == LUTType::Exponential && info.dt == DataType::QASYMM8_SIGNED)); + + for (int i = 0; i < 256; ++i) + { + const float deq = info.dt == DataType::QASYMM8 ? dequantize_qasymm8(i, info.qinfo) + : dequantize_qasymm8_signed(i - 128, info.qinfo); + lut[i] = exponential(deq, info); + } +} + +void init_lut(LookupTable65536 &lut, const LUTInfo &info) { // assert lut is valid config. 
ARM_COMPUTE_ASSERT((info.type == LUTType::Activation && info.dt == DataType::F16) || @@ -82,13 +101,13 @@ void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info) { case LUTType::Activation: { - (*lut)[item.i] = activation(item.fp, info); + lut[item.i] = activation(item.fp, info); break; } case LUTType::Exponential: { - bf16.i = exponential(item.i, info); - (*lut)[item.i] = bf16.fp; + bf16.i = exponential_bf16(item.i, info); + lut[item.i] = bf16.fp; break; } default: @@ -103,10 +122,24 @@ void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info) } // namespace -std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +template <> +inline std::map> &LUTManager::get_map() +{ + return map_fp32; +} + +template <> +inline std::map> &LUTManager::get_map() { - const auto itr = map_fp16.find(info); - auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. + return map_fp16; +} + +template +std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +{ + auto &map = get_map(); + const auto itr = map.find(info); + auto s_ptr = (itr != map.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. if (s_ptr != nullptr) { // Found and valid @@ -116,12 +149,15 @@ std::shared_ptr LUTManager::get_lut_table(LUTInfo info) { // Not found, or pointer not valid // We do not use make_shared to prevent the weak_ptr keeping the control block alive - std::shared_ptr ptr(new LookupTable65536); - init_lut_16bit(ptr.get(), info); - map_fp16[info] = ptr; + std::shared_ptr ptr(new T); + init_lut(*ptr, info); + map[info] = ptr; return ptr; } } + +template std::shared_ptr LUTManager::get_lut_table(LUTInfo info); +template std::shared_ptr LUTManager::get_lut_table(LUTInfo info); #endif // __aarch64__ // Static function to get LutManager instance diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h index 226f44f360..eca9472f41 100644 --- a/src/core/helpers/LUTManager.h +++ b/src/core/helpers/LUTManager.h @@ -35,14 +35,14 @@ namespace arm_compute { #ifdef __aarch64__ -using LookupTable256 = std::array; +using LookupTable256 = std::array; using LookupTable65536 = std::array; #endif // __aarch64__ enum class LUTType { Activation, // Determined by activation type - Exponential, // e^x + Exponential, // e^(beta * x) }; struct LUTInfo @@ -76,7 +76,7 @@ struct LUTInfo ActivationLayerInfo::ActivationFunction act; float alpha; float beta; - DataType dt; + DataType dt; // What datatype the table is indexed with. UniformQuantizationInfo qinfo; LUTType type; // Default is Activation. 
}; @@ -89,9 +89,14 @@ class LUTManager static LUTManager &get_instance(); #ifdef __aarch64__ - std::shared_ptr get_lut_table(LUTInfo info); + template + std::shared_ptr get_lut_table(LUTInfo info); private: + template + inline std::map> &get_map(); + + std::map> map_fp32{}; std::map> map_fp16{}; #endif // __aarch64__ }; diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index c02691d5db..08901437a1 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -53,6 +53,12 @@ static const std::array qasymm8_acti ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU, }; + +/* Static quantization can only, currently, support relu based activations */ +static const std::array qasymm8_static_quant_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + /* Supported activation in the 16-bit integer domain */ static const std::array qsymm16_activations = { ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, @@ -72,6 +78,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized_asymmetric_char(data_type) && oq_info.is_dynamic() && + (std::find(std::begin(qasymm8_static_quant_activations), std::end(qasymm8_static_quant_activations), + f_act) == std::end(qasymm8_static_quant_activations)), + "For QASYMM8 statically quantized, only relu and lower/upper bounded relu are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == @@ -114,6 +126,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const } #ifdef __aarch64__ +// TODO (COMPMID-7511): delegate to LUTManager void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, const UniformQuantizationInfo &qi_in, @@ -208,6 +221,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac // Initialise lut_manager LUTManager &lut_manager = LUTManager::get_instance(); + // TODO (COMPMID-7511): delegate to LUTManager if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) && activation_info.activation() != ActivationFunction::RELU) { @@ -223,7 +237,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac // Create info using init list. 
const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(), src->quantization_info().uniform()}; - activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); + activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); } #endif // __aarch64__ _act_info = activation_info; diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 05c7742b03..b4d44cb5bc 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -893,7 +893,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8: case DataType::U8: { - /* Down-conversion F32 -> U8 */ + /* Down-conversion F32 -> QASYMM8, U8 */ execute_window_loop( win, [&](const Coordinates &) @@ -922,7 +922,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr // Compute left-over elements for (; x < window_end_x; ++x) { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + *(dst_ptr + x) = utils::cast::saturate_static_cast(*(src_ptr + x)); } }, src, dst); @@ -958,7 +958,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr // Compute left-over elements for (; x < window_end_x; ++x) { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + *(dst_ptr + x) = utils::cast::saturate_static_cast(*(src_ptr + x)); } }, src, dst); diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp index 39ba764c78..17e455b3e3 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.cpp +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -306,7 +306,7 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, _kernel_height = kernel_dims.height; _input_pad_right = input_pad_right; _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width, + _convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), _kernel_width, _kernel_height, _conv_info, _dilation); _has_bias = has_bias; diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index b7e395fb79..b9eb0fcb20 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,6 +34,7 @@ #include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/LUTManager.h" #include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/softmax/list.h" @@ -51,6 +52,14 @@ namespace /* Softmax */ static const std::vector available_kernels = { +#if defined(ARM_COMPUTE_ENABLE_BF16) +#if defined(ARM_COMPUTE_ENABLE_SVE) + {"sve_bf16_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::BFLOAT16 && data.isa.sve && data.axis == 0); }, + REGISTER_BF16_SVE(sve_softmax_bf16)}, +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif // defined(ARM_COMPUTE_ENABLE_BF16) {"sme2_fp32_softmax", [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32 && data.isa.sme2 && data.axis == 0); }, @@ -103,28 +112,6 @@ static const std::vector available_ker REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; -void init_lut(std::vector &lut, DataType type, float scale, float beta) -{ - if (type == DataType::QASYMM8) - { - for (int i = 0; i < 256; ++i) - { - lut.push_back(std::exp(-scale * beta * i)); - } - } - else if (type == DataType::QASYMM8_SIGNED) - { - for (int i = -128; i < 128; ++i) - { - lut.push_back(std::exp(-scale * beta * i)); - } - } - else - { - ARM_COMPUTE_ERROR("Invalid datatype for QASYMM8/QASYMM8_SIGNED softmax"); - } -} - Status validate_arguments_softmax( const ITensorInfo &src, const ITensorInfo &dst, float beta, int axis, const ITensorInfo &tmp, bool is_log) { @@ -132,7 +119,7 @@ Status validate_arguments_softmax( // Check input ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); + DataType::F16, DataType::F32, DataType::BFLOAT16); ARM_COMPUTE_RETURN_ERROR_ON(axis < 0 || axis > 3); @@ -195,7 +182,7 @@ void CpuSoftmaxKernel::configure( } const auto *uk = CpuSoftmaxKernel::get_implementation(SoftmaxKernelDataTypeISASelectorData{ - src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length()}); + src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length_in_bits()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); std::string kernel_name = is_log ? 
std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); @@ -232,12 +219,27 @@ void CpuSoftmaxKernel::configure( ICpuKernel::configure(win); +#ifdef __aarch64__ const std::string uk_name = uk->name; + + if (src->data_type() == DataType::BFLOAT16) + { + LUTManager &lutmanager = LUTManager::get_instance(); + LUTInfo info = {LUTType::Exponential, beta, DataType::BFLOAT16, UniformQuantizationInfo()}; + _lut_bf16 = lutmanager.get_lut_table(info); + } + if (uk_name == "sme2_qu8_softmax_lut_512VL" || uk_name == "sme2_qs8_softmax_lut_512VL") { - const float scale = src->quantization_info().uniform().scale; - init_lut(_lut, src->data_type(), scale, beta); + UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + // What the ukernel is interested in looking up is exp(b * deq(q)). The + // quantization offset cancels out in softmax so we don't need it in + // the LUT. + qinfo.offset = 0; + const LUTInfo info{LUTType::Exponential, -beta, src->data_type(), qinfo}; + _lut = LUTManager::get_instance().get_lut_table(info); } +#endif // __aarch64__ } Status CpuSoftmaxKernel::validate( @@ -274,11 +276,24 @@ void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); - _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut.data()); +#ifdef __aarch64__ + if (_lut) + { + _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut->data()); + } + else +#endif // __aarch64__ + { + _run_method(src, tmp_for_thread, dst, _beta, _axis, window, nullptr); + } } else { +#ifdef __aarch64__ + _run_method(src, nullptr, dst, _beta, _axis, window, _lut_bf16.get()); +#else // __aarch64__ _run_method(src, nullptr, dst, _beta, _axis, window, nullptr); +#endif // __aarch64__ } } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index becaa42835..c297d37f3f 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H #include "src/core/common/Macros.h" +#include "src/core/helpers/LUTManager.h" #include "src/cpu/ICpuKernel.h" namespace arm_compute @@ -78,11 +79,14 @@ class CpuSoftmaxKernel : public ICpuKernel static const std::vector &get_available_kernels(); private: - float _beta{1.0f}; - SoftmaxKernelPtr _run_method{nullptr}; - std::string _name{}; - int _axis{}; - std::vector _lut = {}; + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; + int _axis{}; +#ifdef __aarch64__ + std::shared_ptr _lut{nullptr}; + std::shared_ptr _lut_bf16 = nullptr; +#endif // __aarch64__ }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 2897f4b242..c331d0bf02 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -224,7 +224,7 @@ void neon_fp16_to_other_dt_cast( // Compute left-over elements for (; x < window_end_x; ++x) { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + *(dst_ptr + x) = utils::cast::saturate_static_cast(*(src_ptr + x)); } }, src, dst); @@ -256,7 +256,7 @@ void neon_fp16_to_other_dt_cast( // Compute left-over elements for (; x < window_end_x; ++x) { - *(dst_ptr + x) = utils::cast::saturate_cast(*(src_ptr + x)); + *(dst_ptr + x) = utils::cast::saturate_static_cast(*(src_ptr + x)); } }, src, dst); diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 78e3baf74b..1560b38ceb 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -24,6 +24,8 @@ #ifndef ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H #define ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H +#include "arm_compute/core/Helpers.h" + #include "src/core/NEON/NEAsymm.h" namespace arm_compute @@ -567,7 +569,7 @@ inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t & return out; } -inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +inline float32x4x4_t load_quantized(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { qasymm8x16_signed_t x = vld1q_s8(input1_ptr); const float32x4x4_t out = {{ @@ -596,11 +598,14 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { - int32x4x4_t out = {{ - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + // Adjust offset with 0.5 to round to nearest. + const float32x4_t adj_offset = vaddq_f32(offset, vdupq_n_f32(0.5f)); + + const int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[3], invscale)), }}; store_quantized(output_ptr, out); } @@ -612,31 +617,48 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) vst1q_s8(output_ptr, vcombine_s8(pa, pb)); } -inline void store_quantized_signed(int8_t *output_ptr, - const float32x4x4_t &rf, - const float32x4_t &offset, - const float32x4_t &invscale) -{ - int32x4x4_t out = {{ - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), +inline void +store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + // Adjust offset to round to nearest. 
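+    // vcvtq_s32_f32 truncates toward zero, so a bare (offset + rf * invscale)
+    // would be biased toward zero. Selecting +0.5f for non-negative results and
+    // -0.5f for negative ones before the conversion turns the truncation into
+    // round-to-nearest (ties away from zero) for the scaled value, e.g. with
+    // offset = 0 and invscale = 1:   1.5f -> trunc( 1.5f + 0.5f) =  2
+    //                               -1.5f -> trunc(-1.5f - 0.5f) = -2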
+ const uint32x4x4_t cmp = {{ +#ifdef __aarch64__ + vcltzq_f32(rf.val[0]), + vcltzq_f32(rf.val[1]), + vcltzq_f32(rf.val[2]), + vcltzq_f32(rf.val[3]), +#else // __aarch64__ + vcltq_f32(rf.val[0], vdupq_n_f32(0.0f)), + vcltq_f32(rf.val[1], vdupq_n_f32(0.0f)), + vcltq_f32(rf.val[2], vdupq_n_f32(0.0f)), + vcltq_f32(rf.val[3], vdupq_n_f32(0.0f)), +#endif // __aarch64__ + }}; + const float32x4_t neg_point_5 = vdupq_n_f32(-0.5f); + const float32x4_t pos_point_5 = vdupq_n_f32(0.5f); + const float32x4x4_t adj_offset = {{ + vaddq_f32(offset, vbslq_f32(cmp.val[0], neg_point_5, pos_point_5)), + vaddq_f32(offset, vbslq_f32(cmp.val[1], neg_point_5, pos_point_5)), + vaddq_f32(offset, vbslq_f32(cmp.val[2], neg_point_5, pos_point_5)), + vaddq_f32(offset, vbslq_f32(cmp.val[3], neg_point_5, pos_point_5)), }}; - store_quantized_signed(output_ptr, out); -} -template -inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) -{ - return quantize_qasymm8(elementwise_arithm_op_scalar(a, b), qinfo); + const int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[0], rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[1], rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[2], rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(adj_offset.val[3], rf.val[3], invscale)), + }}; + store_quantized_signed(output_ptr, out); } -template -inline int8_t -elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +template ::value || std::is_same::value>> +inline Output elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) { - return quantize_qasymm8_signed(elementwise_arithm_op_scalar(a, b), qinfo); + const float res = elementwise_arithm_op_scalar(a, b); + return Qasymm8QuantizationHelper::quantize(res, qinfo); } template @@ -669,19 +691,23 @@ inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x return out; } -template -inline int elementwise_arithm_op_quantized_loop(int window_start_x, - int window_end_x, - int window_step_x, - const uint8_t *input1_ptr, - const uint8_t *input2_ptr, - uint8_t *output_ptr, - int32x4_t voffset1, - int32x4_t voffset2, - float32x4_t vscale1, - float32x4_t vscale2, - float32x4_t voffseto, - float32x4_t invvscaleo) +template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *input1_ptr, + const Input *input2_ptr, + Output *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; for (; x <= (window_end_x - window_step_x); x += window_step_x) @@ -695,44 +721,22 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, return x; } -template -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, - int window_end_x, - int window_step_x, - const int8_t *input1_ptr, - const int8_t *input2_ptr, - int8_t *output_ptr, - int32x4_t voffset1, - int32x4_t voffset2, - float32x4_t vscale1, - float32x4_t vscale2, - float32x4_t voffseto, - float32x4_t invvscaleo) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get inputs and compute output - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const 
float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const float32x4x4_t rf = elementwise_arithm_op(af, bf); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} - -template -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const uint8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) +template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + Output *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; for (; x <= (window_end_x - window_step_x); x += window_step_x) @@ -745,44 +749,22 @@ inline int elementwise_arithm_op_quantized_broadcast_loop(int window_ } return x; } -template -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const int8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - int8_t *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = - load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = - elementwise_arithm_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); - } - return x; -} -template -inline int elementwise_comp_op_quantized_loop(int window_start_x, - int window_end_x, - int window_step_x, - const uint8_t *input1_ptr, - const uint8_t *input2_ptr, - uint8_t *output_ptr, - int32x4_t voffset1, - int32x4_t voffset2, - float32x4_t vscale1, - float32x4_t vscale2, - float32x4_t voffseto, - float32x4_t invvscaleo) +template ::value || std::is_same::value>> +inline int elementwise_comp_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *input1_ptr, + const Input *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; @@ -796,44 +778,20 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, return x; } -template -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, - int window_end_x, - int window_step_x, - const int8_t *input1_ptr, - const int8_t *input2_ptr, - uint8_t *output_ptr, - int32x4_t voffset1, - int32x4_t voffset2, - float32x4_t vscale1, - float32x4_t vscale2, - float32x4_t voffseto, - float32x4_t invvscaleo) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); - const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); - const uint32x4x4_t rf = elementwise_comp_op(af, bf); - store_quantized(output_ptr + x, rf); - } - return x; -} - -template -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const uint8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) +template ::value || std::is_same::value>> +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const Input *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; @@ -848,43 +806,21 @@ inline int elementwise_comp_op_quantized_broadcast_loop(int window_st return x; } -template -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, - int window_end_x, - int window_step_x, - const int8_t *non_broadcast_input_ptr, - float32x4x4_t broadcast_vector, - uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, - float32x4_t vscale_non_broadcast, - float32x4_t voffseto, - float32x4_t invvscaleo, - bool reorder) -{ - ARM_COMPUTE_UNUSED(voffseto, invvscaleo); - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4x4_t af = - load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = - elementwise_comp_op(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); - store_quantized(output_ptr + x, rf); - } - return x; -} - +template ::value || std::is_same::value) && + (std::is_same::value || std::is_same::value)>> inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + Output (*scalar_func)(const float &, const float &, UniformQuantizationInfo), int (*broadcast_func)(int, int, int, - const uint8_t *, + const Input *, float32x4x4_t, - uint8_t *, + Output *, int32x4_t, float32x4_t, float32x4_t, @@ -893,9 +829,9 @@ inline void elementwise_op_quantized(const ITensor *in1, int (*neon_func)(int, int, int, - const uint8_t *, - const uint8_t *, - uint8_t *, + const Input *, + const Input *, + Output *, int32x4_t, int32x4_t, float32x4_t, @@ -903,277 +839,8 @@ inline void elementwise_op_quantized(const ITensor *in1, float32x4_t, float32x4_t)) { - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if (is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, - broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, - voffseto, invvscaleo, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, - !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, - voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); - for (; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -inline void -elementwise_comp_quantized_signed(const ITensor *in1, - const ITensor *in2, - ITensor *out, - const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, - int, - int, - const int8_t *, - float32x4x4_t, - uint8_t *, - int32x4_t, - float32x4_t, - float32x4_t, - float32x4_t, - 
const bool), - int (*neon_func)(int, - int, - int, - const int8_t *, - const int8_t *, - uint8_t *, - int32x4_t, - int32x4_t, - float32x4_t, - float32x4_t, - float32x4_t, - float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); - - const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - - if (is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, - broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, - voffseto, invvscaleo, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, - !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); - - // Input1 quantization info - const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); - - // Input2 quantization info - const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + using InputVector = wrapper::traits::neon_vector_t; - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, - voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); - for (; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); - } -} - -inline void -elementwise_op_quantized_signed(const ITensor *in1, - const ITensor *in2, - ITensor *out, - const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, - int, - int, - const int8_t *, - float32x4x4_t, - int8_t *, - int32x4_t, - float32x4_t, - float32x4_t, - float32x4_t, - const bool), - int (*neon_func)(int, - int, - int, - const int8_t *, - const int8_t *, - int8_t *, - int32x4_t, - int32x4_t, - float32x4_t, - float32x4_t, - float32x4_t, - float32x4_t)) -{ // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); @@ -1218,19 +885,22 @@ elementwise_op_quantized_signed(const ITensor *in1, win, [&](const Coordinates &) { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const Input broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const InputVector broadcast_value_v = + wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag{}); + const float32x4x4_t broadcast_vector = vdequantize(broadcast_value_v, broadcast_qinfo); int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); for (; x < window_end_x; ++x) { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr 
+ x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + const float afs = Qasymm8QuantizationHelper::dequantize(*(non_broadcast_input_ptr + x), + non_broadcast_qinfo); + const float bfs = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); } @@ -1262,16 +932,16 @@ elementwise_op_quantized_signed(const ITensor *in1, win, [&](const Coordinates &) { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); for (; x < window_end_x; ++x) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + const float afs = Qasymm8QuantizationHelper::dequantize(*(input1_ptr + x), input1_qinfo); + const float bfs = Qasymm8QuantizationHelper::dequantize(*(input2_ptr + x), input2_qinfo); *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); } }, @@ -1282,33 +952,34 @@ elementwise_op_quantized_signed(const ITensor *in1, template void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, - &elementwise_arithm_op_quantized_broadcast_loop, - &elementwise_arithm_op_quantized_loop); + elementwise_op_quantized(in1, in2, out, window, + &elementwise_arithm_op_quantized_scalar, + &elementwise_arithm_op_quantized_broadcast_loop, + &elementwise_arithm_op_quantized_loop); } template void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar, - &elementwise_arithm_op_quantized_signed_broadcast_loop, - &elementwise_arithm_op_quantized_singed_loop); + elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar, + &elementwise_arithm_op_quantized_broadcast_loop, + &elementwise_arithm_op_quantized_loop); } template void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_broadcast_loop, - &elementwise_comp_op_quantized_loop); + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + &elementwise_comp_op_quantized_broadcast_loop, + &elementwise_comp_op_quantized_loop); } template void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, - &elementwise_comp_op_quantized_signed_broadcast_loop, - &elementwise_comp_op_quantized_signed_loop); + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar, + &elementwise_comp_op_quantized_broadcast_loop, + 
&elementwise_comp_op_quantized_loop); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h index 9954a7645e..ba7865cf43 100644 --- a/src/cpu/kernels/quantize/generic/neon/impl.h +++ b/src/cpu/kernels/quantize/generic/neon/impl.h @@ -77,6 +77,21 @@ inline vector_type vquantize_qasymm8(const float32x4x4_t &qv, co return vquantize_signed(qv, qi); } +template +inline vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformRequantizationInfo &qi); + +template <> +inline vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformRequantizationInfo &qi) +{ + return vquantize(qv, qi); +} + +template <> +inline vector_type vquantize_qasymm8(const float32x4x4_t &qv, const UniformRequantizationInfo &qi) +{ + return vquantize_signed(qv, qi); +} + template ::value, bool>::type> inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) { @@ -239,12 +254,17 @@ void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); + constexpr bool is_8bit_int = std::is_same::value || std::is_same::value; + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if (is_data_type_quantized_asymmetric(src->info()->data_type())) + UniformRequantizationInfo reqinfo(1.f, 0); + + if (is_8bit_int) { - uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + reqinfo = compute_requantization_scale_float_offset(uqinfo_in, uqinfo); } + #ifdef __aarch64__ constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; #else //__aarch64__ @@ -267,12 +287,26 @@ void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window int x = window_start_x; for (; x <= (window_end_x - window_step); x += window_step) { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + if (is_8bit_int) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), reqinfo)); + } + else + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8(load_value(&input_ptr[x]), uqinfo)); + } } // Compute left-over elements for (; x < window_end_x; ++x) { - output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); + if (is_8bit_int) + { + output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], reqinfo, rounding_policy); + } + else + { + output_ptr[x] = Qasymm8QuantizationHelper::quantize(input_ptr[x], uqinfo, rounding_policy); + } } }, input, output); diff --git a/src/cpu/kernels/reduction_layer/generic/neon/impl.h b/src/cpu/kernels/reduction_layer/generic/neon/impl.h index 3fa821d3a4..0c4a7c70c0 100644 --- a/src/cpu/kernels/reduction_layer/generic/neon/impl.h +++ b/src/cpu/kernels/reduction_layer/generic/neon/impl.h @@ -873,8 +873,7 @@ struct RedOpX_quantized if (op == ReductionOperation::MEAN_SUM) { - const int32_t resFinal = A * (static_cast(res)) + B; - + const float resFinal = A * (static_cast(res)) + B; *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); } else @@ -1427,10 +1426,10 @@ struct RedOpYZW_quantized vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); #ifdef __aarch64__ - vec_res_value1 = wrapper::vcvta(vec_res_value1_f); - vec_res_value2 = wrapper::vcvta(vec_res_value2_f); - vec_res_value3 = 
wrapper::vcvta(vec_res_value3_f); - vec_res_value4 = wrapper::vcvta(vec_res_value4_f); + vec_res_value1 = wrapper::vcvtn(vec_res_value1_f); + vec_res_value2 = wrapper::vcvtn(vec_res_value2_f); + vec_res_value3 = wrapper::vcvtn(vec_res_value3_f); + vec_res_value4 = wrapper::vcvtn(vec_res_value4_f); #else // defined(__aarch64__) vec_res_value1 = wrapper::vcvt(vec_res_value1_f); vec_res_value2 = wrapper::vcvt(vec_res_value2_f); @@ -1584,8 +1583,8 @@ struct RedOpYZW_quantized { // Apply previously calculated coefficients (with rounding on aarch64) #ifdef __aarch64__ - const int32_t res = - arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); + const int32_t res = arm_compute::round(A * (static_cast(res_value_q)) + B, + RoundingPolicy::TO_NEAREST_EVEN); #else // defined(__aarch64__) const int32_t res = A * (static_cast(res_value_q)) + B; #endif // __aarch64__ diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp index f3d443f9aa..f449b4b9b8 100644 --- a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp @@ -85,7 +85,7 @@ void sme2_qasymm8_softmax_kernel_512VL( // // * pn9: all-true for 32 bit values // * pn8: all-true for 8-bit values // - // * z0-z15 the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values + // * z0-z11, z20-z23 the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values // Prepares all constant values @@ -115,8 +115,12 @@ void sme2_qasymm8_softmax_kernel_512VL( // add x2, x2, #256 .inst 0xa040c448 //ld1w { z8.s - z11.s }, pn9/z, [x2] add x2, x2, #256 - .inst 0xa040c44c //ld1w { z12.s - z15.s }, pn9/z, [x2] + .inst 0xa040c454 //ld1w { z20.s - z23.s }, pn9/z, [x2] + dup z24.b, #0 + dup z25.b, #0 + dup z26.b, #0 + dup z27.b, #0 loop_3_start%=: // for index_3 in shape_3 downto 1 @@ -156,8 +160,8 @@ loop_1_start%=: find_max_body_start%=: cmp x1, x13 b.eq find_max_body_end%= - .inst 0xa0018374 // ld1b { z20.b - z23.b }, pn8/z, [x27, x1] z20-z23: x - .inst 0xc134b811 // umax { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x) + .inst 0xa001836c // ld1b { z12.b - z15.b }, pn8/z, [x27, x1] z12-z15: x + .inst 0xc12cb811 // umax { z16.b - z19.b }, { z16.b - z19.b }, { z12.b - z15.b } z16-z19: max_value = max(max_value, x) add x1, x1, #256 // Advance index by 256 bytes/integers: Z registers = 2048-bit data = 256 8-bit integers. b find_max_body_start%= find_max_body_end%=: @@ -181,12 +185,17 @@ find_max_leftover_end%=: dup z16.b, z16.b[0] uunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction uunpklo z16.s, z16.h + mov z12.d, z16.d // Save to z12, as z16 will be overwritten. 
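+    // Register budget for the exponentiation loop: z0-z11 and z20-z23 keep the
+    // 256-entry FP32 LUT resident, z12 holds the row maximum, z13 holds the
+    // constant -16 used to step the TBX indices to the next 16-entry LUT chunk,
+    // z16-z19 carry the per-iteration indices, z24-z27 receive the looked-up
+    // exponentials and z28 accumulates the row sum.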
mov x1, #0 // reset index - dup z25.s, #0 + dup z28.s, #0 mov x1, #0 + dup z13.s, #-16 + // ================================================== + // Step 2: Exponentiation and Summation + // ================================================== regularize_start%=: whilelo p1.b, x1, %x[length] b.none regularize_end%= @@ -201,192 +210,147 @@ regularize_start%=: punpkhi p5.h, p4.b punpklo p4.h, p4.b - ld1b z17.b, p1/z, [x27, x1] //z17: input data - - uunpklo z18.h, z17.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction - uunpkhi z19.h, z17.b - - uunpklo z17.s, z18.h // z17 = low low input QASYMM8 values - uunpkhi z18.s, z18.h // z18 = low high input QASYMM8 values - - uunpkhi z20.s, z19.h // z20 = high high input QASYMM8 values - uunpklo z19.s, z19.h // z19 = high low input QASYMM8 values - - sub z17.s, z16.s, z17.s // z12: x = max_value - input_data - sub z18.s, z16.s, z18.s // z13: x = max_value - input_data - sub z19.s, z16.s, z19.s // z14: x = max_value - input_data - sub z20.s, z16.s, z20.s // z15: x = max_value - input_data - - tbx z21.s, z0.s, z17.s // Look-up entries 0-15 in the LUT. - tbx z22.s, z0.s, z18.s - tbx z23.s, z0.s, z19.s - tbx z24.s, z0.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z1.s, z17.s // Look-up entries 16-31 in the LUT. - tbx z22.s, z1.s, z18.s - tbx z23.s, z1.s, z19.s - tbx z24.s, z1.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z2.s, z17.s // Look-up entries 32-47 in the LUT. - tbx z22.s, z2.s, z18.s - tbx z23.s, z2.s, z19.s - tbx z24.s, z2.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z3.s, z17.s // Look-up entries 48-63 in the LUT. - tbx z22.s, z3.s, z18.s - tbx z23.s, z3.s, z19.s - tbx z24.s, z3.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z4.s, z17.s // Look-up entries 64-79 in the LUT. - tbx z22.s, z4.s, z18.s - tbx z23.s, z4.s, z19.s - tbx z24.s, z4.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z5.s, z17.s // Look-up entries 80-95 in the LUT. - tbx z22.s, z5.s, z18.s - tbx z23.s, z5.s, z19.s - tbx z24.s, z5.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z6.s, z17.s // Look-up entries 96-111 in the LUT. - tbx z22.s, z6.s, z18.s - tbx z23.s, z6.s, z19.s - tbx z24.s, z6.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z7.s, z17.s // Look-up entries 112-127 in the LUT. - tbx z22.s, z7.s, z18.s - tbx z23.s, z7.s, z19.s - tbx z24.s, z7.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z8.s, z17.s // Look-up entries 128-143 in the LUT. - tbx z22.s, z8.s, z18.s - tbx z23.s, z8.s, z19.s - tbx z24.s, z8.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z9.s, z17.s // Look-up entries 144-159 in the LUT. 
- tbx z22.s, z9.s, z18.s - tbx z23.s, z9.s, z19.s - tbx z24.s, z9.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z10.s, z17.s // Look-up entries 160-175 in the LUT. - tbx z22.s, z10.s, z18.s - tbx z23.s, z10.s, z19.s - tbx z24.s, z10.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z11.s, z17.s // Look-up entries 176-191 in the LUT. - tbx z22.s, z11.s, z18.s - tbx z23.s, z11.s, z19.s - tbx z24.s, z11.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z12.s, z17.s // Look-up entries 192-207 in the LUT. - tbx z22.s, z12.s, z18.s - tbx z23.s, z12.s, z19.s - tbx z24.s, z12.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z13.s, z17.s // Look-up entries 208-223 in the LUT. - tbx z22.s, z13.s, z18.s - tbx z23.s, z13.s, z19.s - tbx z24.s, z13.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z14.s, z17.s // Look-up entries 224-239 in the LUT. - tbx z22.s, z14.s, z18.s - tbx z23.s, z14.s, z19.s - tbx z24.s, z14.s, z20.s - - sub z17.s, z17.s, #16 - sub z18.s, z18.s, #16 - sub z19.s, z19.s, #16 - sub z20.s, z20.s, #16 - - tbx z21.s, z15.s, z17.s // Look-up entries 240-255 in the LUT. - tbx z22.s, z15.s, z18.s - tbx z23.s, z15.s, z19.s - tbx z24.s, z15.s, z20.s - - - st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor - fadd z25.s, p2/m, z25.s, z21.s + ld1b z16.b, p1/z, [x27, x1] //z16: input data + + uunpklo z17.h, z16.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction + uunpkhi z18.h, z16.b + + uunpklo z16.s, z17.h // z16 = low low input QASYMM8 values + uunpkhi z17.s, z17.h // z17 = low high input QASYMM8 values + + uunpkhi z19.s, z18.h // z19 = high high input QASYMM8 values + uunpklo z18.s, z18.h // z18 = high low input QASYMM8 values + + sub z16.s, z12.s, z16.s // z16: x = max_value - input_data + sub z17.s, z12.s, z17.s // z17: x = max_value - input_data + sub z18.s, z12.s, z18.s // z18: x = max_value - input_data + sub z19.s, z12.s, z19.s // z19: x = max_value - input_data + + tbx z24.s, z0.s, z16.s // Look-up entries 0-15 in the LUT. + tbx z25.s, z0.s, z17.s + tbx z26.s, z0.s, z18.s + tbx z27.s, z0.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z1.s, z16.s // Look-up entries 16-31 in the LUT. + tbx z25.s, z1.s, z17.s + tbx z26.s, z1.s, z18.s + tbx z27.s, z1.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z2.s, z16.s // Look-up entries 32-47 in the LUT. + tbx z25.s, z2.s, z17.s + tbx z26.s, z2.s, z18.s + tbx z27.s, z2.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z3.s, z16.s // Look-up entries 48-63 in the LUT. + tbx z25.s, z3.s, z17.s + tbx z26.s, z3.s, z18.s + tbx z27.s, z3.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z4.s, z16.s // Look-up entries 64-79 in the LUT. + tbx z25.s, z4.s, z17.s + tbx z26.s, z4.s, z18.s + tbx z27.s, z4.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z5.s, z16.s // Look-up entries 80-95 in the LUT. 
+ tbx z25.s, z5.s, z17.s + tbx z26.s, z5.s, z18.s + tbx z27.s, z5.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z6.s, z16.s // Look-up entries 96-111 in the LUT. + tbx z25.s, z6.s, z17.s + tbx z26.s, z6.s, z18.s + tbx z27.s, z6.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z7.s, z16.s // Look-up entries 112-127 in the LUT. + tbx z25.s, z7.s, z17.s + tbx z26.s, z7.s, z18.s + tbx z27.s, z7.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z8.s, z16.s // Look-up entries 128-143 in the LUT. + tbx z25.s, z8.s, z17.s + tbx z26.s, z8.s, z18.s + tbx z27.s, z8.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z9.s, z16.s // Look-up entries 144-159 in the LUT. + tbx z25.s, z9.s, z17.s + tbx z26.s, z9.s, z18.s + tbx z27.s, z9.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z10.s, z16.s // Look-up entries 160-175 in the LUT. + tbx z25.s, z10.s, z17.s + tbx z26.s, z10.s, z18.s + tbx z27.s, z10.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z11.s, z16.s // Look-up entries 176-191 in the LUT. + tbx z25.s, z11.s, z17.s + tbx z26.s, z11.s, z18.s + tbx z27.s, z11.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z20.s, z16.s // Look-up entries 192-207 in the LUT. + tbx z25.s, z20.s, z17.s + tbx z26.s, z20.s, z18.s + tbx z27.s, z20.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z21.s, z16.s // Look-up entries 208-223 in the LUT. + tbx z25.s, z21.s, z17.s + tbx z26.s, z21.s, z18.s + tbx z27.s, z21.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z22.s, z16.s // Look-up entries 224-239 in the LUT. + tbx z25.s, z22.s, z17.s + tbx z26.s, z22.s, z18.s + tbx z27.s, z22.s, z19.s + + .inst 0xc1adab10 // add {z16.s-z19.s}, {z16.s-z19.s}, z13.s + + tbx z24.s, z23.s, z16.s // Look-up entries 240-255 in the LUT. + tbx z25.s, z23.s, z17.s + tbx z26.s, z23.s, z18.s + tbx z27.s, z23.s, z19.s + + + st1w z24.s, p2, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor + fadd z28.s, p2/m, z28.s, z24.s add x1, x1, #16 - st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor - fadd z25.s, p3/m, z25.s, z22.s + st1w z25.s, p3, [x29, x1, LSL #2]// z25 store exp(-scale*beta*x) into the tmp tensor + fadd z28.s, p3/m, z28.s, z25.s add x1, x1, #16 - st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor - fadd z25.s, p4/m, z25.s, z23.s + st1w z26.s, p4, [x29, x1, LSL #2]// z26 store exp(-scale*beta*x) into the tmp tensor + fadd z28.s, p4/m, z28.s, z26.s add x1, x1, #16 - st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor - fadd z25.s, p5/m, z25.s, z24.s + st1w z27.s, p5, [x29, x1, LSL #2]// z27 store exp(-scale*beta*x) into the tmp tensor + fadd z28.s, p5/m, z28.s, z27.s add x1, x1, #16 b regularize_start%= @@ -395,9 +359,9 @@ regularize_end%=: mov w9, 0x0000 movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to the [0,255] integer range of QASYMM8 dup z29.s, w9 - faddv s25, p0, z25.s - fdiv s25, s29, s25 - dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax. 
+ faddv s28, p0, z28.s + fdiv s28, s29, s28 + dup z28.s, z28.s[0] // z28: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax. // ================================================== // Step 3: Normalize @@ -408,36 +372,36 @@ normalize_body_start%=: b.eq normalize_body_end%= mov x2, x1 // Preserve the index into x2 for the final store to dst. - .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2] + .inst 0xa001c7ac // ld1w { z12.s - z15.s }, pn9/z, [x29, x1, lsl #2] add x1, x1, #64 - .inst 0xa001c7b4 // ld1w { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2] + .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2] add x1, x1, #64 - // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. - fmul z16.s, z25.s, z16.s - fmul z17.s, z25.s, z17.s - fmul z18.s, z25.s, z18.s - fmul z19.s, z25.s, z19.s - fmul z20.s, z25.s, z20.s - fmul z21.s, z25.s, z21.s - fmul z22.s, z25.s, z22.s - fmul z23.s, z25.s, z23.s - - // z16-z23: convert the FP32 values from the tmp tensor to uint32. + // z12-z19: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z12.s, z28.s, z12.s + fmul z13.s, z28.s, z13.s + fmul z14.s, z28.s, z14.s + fmul z15.s, z28.s, z15.s + fmul z16.s, z28.s, z16.s + fmul z17.s, z28.s, z17.s + fmul z18.s, z28.s, z18.s + fmul z19.s, z28.s, z19.s + + // z12-z19: convert the FP32 values from the tmp tensor to uint32. + fcvtzu z12.s, p0/m, z12.s + fcvtzu z13.s, p0/m, z13.s + fcvtzu z14.s, p0/m, z14.s + fcvtzu z15.s, p0/m, z15.s fcvtzu z16.s, p0/m, z16.s fcvtzu z17.s, p0/m, z17.s fcvtzu z18.s, p0/m, z18.s fcvtzu z19.s, p0/m, z19.s - fcvtzu z20.s, p0/m, z20.s - fcvtzu z21.s, p0/m, z21.s - fcvtzu z22.s, p0/m, z22.s - fcvtzu z23.s, p0/m, z23.s - // z16-z17: narrow the uint32 values into uint8 and saturate them. - .inst 0xc133e230 // uqcvt z16.b, { z16.s - z19.s } - .inst 0xc133e2b1 // uqcvt z17.b, { z20.s - z23.s } + // z12-z13: narrow the uint32 values into uint8 and saturate them. + .inst 0xc133e1ac // uqcvt z12.b, { z12.s - z15.s } + .inst 0xc133e22d // uqcvt z13.b, { z16.s - z19.s } - dup z20.s, z25.s[0] // Juggling the value to z20 as z25 will be overwritten by the load below + dup z16.s, z28.s[0] // Juggling the value to z16 as z28 will be overwritten by the load below .inst 0xa001c7b8 // ld1w { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2] add x1, x1, #64 @@ -445,14 +409,14 @@ normalize_body_start%=: add x1, x1, #64 // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. - fmul z24.s, z20.s, z24.s - fmul z25.s, z20.s, z25.s - fmul z26.s, z20.s, z26.s - fmul z27.s, z20.s, z27.s - fmul z28.s, z20.s, z28.s - fmul z29.s, z20.s, z29.s - fmul z30.s, z20.s, z30.s - fmul z31.s, z20.s, z31.s + fmul z24.s, z16.s, z24.s + fmul z25.s, z16.s, z25.s + fmul z26.s, z16.s, z26.s + fmul z27.s, z16.s, z27.s + fmul z28.s, z16.s, z28.s + fmul z29.s, z16.s, z29.s + fmul z30.s, z16.s, z30.s + fmul z31.s, z16.s, z31.s // z24-z31: convert the FP32 values from the tmp tensor to uint32. fcvtzu z24.s, p0/m, z24.s @@ -464,13 +428,13 @@ normalize_body_start%=: fcvtzu z30.s, p0/m, z30.s fcvtzu z31.s, p0/m, z31.s - // z18-z19: narrow the uint32 values into uint8 and saturate them. - .inst 0xc133e332 // uqcvt z18.b, { z24.s - z27.s } - .inst 0xc133e3b3 // uqcvt z19.b, { z28.s - z31.s } + // z14-z15: narrow the uint32 values into uint8 and saturate them. 
+ .inst 0xc133e32e // uqcvt z14.b, { z24.s - z27.s } + .inst 0xc133e3af // uqcvt z15.b, { z28.s - z31.s } - .inst 0xa0228390 // st1b { z16.b - z19.b }, pn8, [x28, x2] + .inst 0xa022838c // st1b { z12.b - z15.b }, pn8, [x28, x2] - dup z25.s, z20.s[0] // Juggling the value back to z25 as z20 will be overwritten by the next iteration or z25 will be used below. + dup z28.s, z16.s[0] // Juggling the value back to z28 as z16 will be overwritten by the next iteration b normalize_body_start%= normalize_body_end%=: @@ -491,32 +455,32 @@ normalize_leftover_start%=: mov x2, x1 // Preserve the index into x2 for the final store to dst. - // z20-z23: load exp(-scale*beta*x) from the tmp tensor - ld1w z20.s, p2/z, [x29, x1, LSL #2] + // z12-z15: load exp(-scale*beta*x) from the tmp tensor + ld1w z12.s, p2/z, [x29, x1, LSL #2] add x1, x1, #16 - ld1w z21.s, p3/z, [x29, x1, LSL #2] + ld1w z13.s, p3/z, [x29, x1, LSL #2] add x1, x1, #16 - ld1w z22.s, p4/z, [x29, x1, LSL #2] + ld1w z14.s, p4/z, [x29, x1, LSL #2] add x1, x1, #16 - ld1w z23.s, p5/z, [x29, x1, LSL #2] + ld1w z15.s, p5/z, [x29, x1, LSL #2] add x1, x1, #16 - // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. - fmul z20.s, z25.s, z20.s - fmul z21.s, z25.s, z21.s - fmul z22.s, z25.s, z22.s - fmul z23.s, z25.s, z23.s + // z12-z15: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z12.s, z28.s, z12.s + fmul z13.s, z28.s, z13.s + fmul z14.s, z28.s, z14.s + fmul z15.s, z28.s, z15.s - // z20-23: convert the FP32 values from the tmp tensor to uint32. - fcvtzu z20.s, p0/m, z20.s - fcvtzu z21.s, p0/m, z21.s - fcvtzu z22.s, p0/m, z22.s - fcvtzu z23.s, p0/m, z23.s + // z12-z15: convert the FP32 values from the tmp tensor to uint32. + fcvtzu z12.s, p0/m, z12.s + fcvtzu z13.s, p0/m, z13.s + fcvtzu z14.s, p0/m, z14.s + fcvtzu z15.s, p0/m, z15.s - .inst 0xc133e2b3 // uqcvt z19.b, { z20.s - z23.s }, narrow the uint32 values into uint8 and saturate them into z19. + .inst 0xc133e1b3 // uqcvt z19.b, { z12.s - z15.s }, narrow the uint32 values into uint8 and saturate them into z19. st1b z19.b, p1, [x28, x2] @@ -550,7 +514,7 @@ loop_3_end%=: [dst_stride_3] "r"(dst_strides[3]), // [length] "r"(shape[0]) // : "cc", "memory", // - "p0", "p1", "p2", "p3", "p4", // + "p0", "p1", "p2", "p3", "p4", "p5", // "x2", "x9", "x13", // "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", // "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // diff --git a/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp b/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp new file mode 100644 index 0000000000..e0b85d91f3 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ + +void sve_softmax_bf16(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const void *lut_ptr) +{ + ARM_COMPUTE_UNUSED(tmp); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(axis); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lut_ptr); + const auto lut_fp16_ptr = reinterpret_cast(lut_ptr); + + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue(); + const auto all_true_pg_f32 = wrapper::svptrue(); + const auto all_true_pg_u32 = wrapper::svptrue(); + const int vec_count = wrapper::svcnt(); + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + + /* Compute Max: unlike in the conventional Softmax, we subtract the maximum value in the axis from each input (both in numerator and denominator) to reduce overall magnitude while maintaining correctness of output */ + float32_t max_val(std::numeric_limits::lowest()); + { + auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); + + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + const svbool_t p_32_true = svptrue_b32(); + + svbool_t pg_u16 = wrapper::svwhilelt(x, input_width); + svbool_t pg_f32_low = svunpklo(pg_u16); + svbool_t pg_f32_high = svunpkhi(pg_u16); + do + { + const svuint16_t current_value_bf16 = svld1(pg, in_ptr + x); + + svuint32_t current_value_u32_low = svunpklo(current_value_bf16); + svuint32_t current_value_u32_high = svunpkhi(current_value_bf16); + + current_value_u32_low = svlsl_n_u32_z(p_32_true, current_value_u32_low, 16); + current_value_u32_high = svlsl_n_u32_z(p_32_true, current_value_u32_high, 16); + + const svfloat32_t current_value_fp32_low = svreinterpret_f32_u32(current_value_u32_low); + const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high); + + vec_max = svmax_m(pg_f32_low, vec_max, current_value_fp32_low); + vec_max = svmax_m(pg_f32_high, vec_max, current_value_fp32_high); + + x += vec_count; + pg = wrapper::svwhilelt(x, input_width); + pg_u16 = wrapper::svwhilelt(x, input_width); + pg_f32_low = svunpklo(pg_u16); + pg_f32_high = svunpkhi(pg_u16); + } while (svptest_any(all_true_pg, pg)); + + // Reduce vec to single max value + max_val = svmaxv(all_true_pg, vec_max); + } + float32_t sum(0.f); + { + /* Init sum to zero */ + svfloat32_t vec_sum = wrapper::svdup_n(static_cast(0)); + const svfloat32_t vec_max = wrapper::svdup_n(max_val); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t 
pg_u16 = wrapper::svwhilelt(x, input_width); + + svbool_t pg_f32_low = svunpklo(pg_u16); + svbool_t pg_f32_high = svunpkhi(pg_u16); + + do + { + const svuint16_t vec_elements = svld1(pg, in_ptr + x); + + svuint32_t current_value_u32_low = svunpklo(vec_elements); + svuint32_t current_value_u32_high = svunpkhi(vec_elements); + + current_value_u32_low = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_low, 16); + current_value_u32_high = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_high, 16); + + const svfloat32_t current_value_fp32_low = svreinterpret_f32_u32(current_value_u32_low); + const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high); + + /* The aforementioned (on line 71) subtraction to reduce magnitude below, effectively a division by the exponentiated maximum value in the current axis */ + svfloat32_t vec_subbed_low_fp32 = svsub_z(pg_f32_low, current_value_fp32_low, vec_max); + svfloat32_t vec_subbed_high_fp32 = svsub_z(pg_f32_high, current_value_fp32_high, vec_max); + + const svuint16_t vec_subbed_low_uint16 = svreinterpret_u16_u32( + svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(vec_subbed_low_fp32), 16)); + const svuint16_t vec_subbed_high_uint16 = svreinterpret_u16_u32( + svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(vec_subbed_high_fp32), 16)); + + // Use LUT to get x : e^x*b + const svuint32_t loaded_exp_16bit_values_low = svld1uh_gather_index_u32( + pg_f32_low, lut_fp16_ptr, svreinterpret_u32_u16(vec_subbed_low_uint16)); + const svuint32_t loaded_exp_16bit_values_high = svld1uh_gather_index_u32( + pg_f32_high, lut_fp16_ptr, svreinterpret_u32_u16(vec_subbed_high_uint16)); + + // Recombine LUT values + const svuint16_t exp_bf16 = svuzp1(svreinterpret_u16_u32(loaded_exp_16bit_values_low), + svreinterpret_u16_u32(loaded_exp_16bit_values_high)); + + /* This store is not the final output value, the output tensor is used to store the numerator/dividend of the softmax operation for use in the final step + as there are likely not enough registers for a whole axis' values */ + svst1(pg, out_ptr + x, exp_bf16); + + svuint32_t exp_u32_low = svunpklo(exp_bf16); + svuint32_t exp_u32_high = svunpkhi(exp_bf16); + + exp_u32_low = svlsl_n_u32_z(all_true_pg_u32, exp_u32_low, 16); + exp_u32_high = svlsl_n_u32_z(all_true_pg_u32, exp_u32_high, 16); + + const svfloat32_t exp_fp32_low = svreinterpret_f32_u32(exp_u32_low); + const svfloat32_t exp_fp32_high = svreinterpret_f32_u32(exp_u32_high); + + vec_sum = svadd_m(pg_f32_low, vec_sum, exp_fp32_low); + vec_sum = svadd_m(pg_f32_high, vec_sum, exp_fp32_high); + + x += vec_count; + pg = wrapper::svwhilelt(x, input_width); + pg_u16 = wrapper::svwhilelt(x, input_width); + + pg_f32_low = svunpklo(pg_u16); + pg_f32_high = svunpkhi(pg_u16); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg_f32, vec_sum); + sum = float32_t(1) / sum; + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_u16 = wrapper::svwhilelt(x, input_width); + svbool_t pg_f32_low = svunpklo(pg_u16); + svbool_t pg_f32_high = svunpkhi(pg_u16); + + do + { + const svuint16_t vec_in = svld1(pg, out_ptr + x); + + svuint32_t current_value_u32_low = svunpklo(vec_in); + svuint32_t current_value_u32_high = svunpkhi(vec_in); + + current_value_u32_low = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_low, 16); + current_value_u32_high = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_high, 16); + + 
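+                // Note: bf16 is handled throughout this kernel as the upper 16 bits of an IEEE-754
+                // binary32 value. Widening (as above) is therefore exact:
+                //     f32_bits = uint32_t(bf16_bits) << 16
+                // and the narrowing back to bf16 further below is a plain truncation:
+                //     bf16_bits = uint16_t(f32_bits >> 16)
+                // i.e. no rounding is applied when the normalized outputs are stored.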
const svfloat32_t current_value_fp32_low = svreinterpret_f32_u32(current_value_u32_low); + const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high); + + const svfloat32_t normalized_value_fp32_low = + svmul_z(pg_f32_low, current_value_fp32_low, wrapper::svdup_n(sum)); + const svfloat32_t normalized_value_fp32_high = + svmul_z(pg_f32_high, current_value_fp32_high, wrapper::svdup_n(sum)); + + const svuint16_t normalized_value_low_uint16 = svreinterpret_u16_u32( + svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(normalized_value_fp32_low), 16)); + const svuint16_t normalized_value_high_uint16 = svreinterpret_u16_u32( + svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(normalized_value_fp32_high), 16)); + + const svuint16_t normalized_value_bf16 = + svuzp1(normalized_value_low_uint16, normalized_value_high_uint16); + + svst1(pg, out_ptr + x, normalized_value_bf16); + + x += vec_count; + pg = wrapper::svwhilelt(x, input_width); + pg_u16 = wrapper::svwhilelt(x, input_width); + pg_f32_low = svunpklo(pg_u16); + pg_f32_high = svunpkhi(pg_u16); + } while (svptest_any(all_true_pg, pg)); + } + }, + in_it, out_it); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 9b11f1eaed..1e39581ef2 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -74,6 +74,18 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, #endif // ARM_COMPUTE_ENABLE_SME2 +#ifdef ARM_COMPUTE_ENABLE_BF16 + +void sve_softmax_bf16(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const void *lut_ptr); + +#endif // ARM_COMPUTE_ENABLE_BF16 + #undef DECLARE_SOFTMAX_KERNEL } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h index 73c85f2dcd..2563270133 100644 --- a/src/cpu/operators/CpuDirectConv2d.h +++ b/src/cpu/operators/CpuDirectConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,14 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H -#define ARM_COMPUTE_CPU_DIRECTCONV2D_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -56,7 +57,10 @@ namespace cpu class CpuDirectConv2d : public ICpuOperator { public: - CpuDirectConv2d(std::shared_ptr memory_manager = nullptr); + CpuDirectConv2d(std::shared_ptr memory_manager); + CpuDirectConv2d() : CpuDirectConv2d(MemoryManagerOnDemand::make_default()) + { + } ~CpuDirectConv2d(); /** Set the input, weights, biases and output tensors. 
* @@ -112,4 +116,4 @@ class CpuDirectConv2d : public ICpuOperator }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h index 3ad1e09a14..3c2a435042 100644 --- a/src/cpu/operators/CpuDirectConv3d.h +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H -#define ARM_COMPUTE_CPU_DIRECTCONV3D_H +#ifndef ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H +#define ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" @@ -30,6 +30,7 @@ #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -54,7 +55,10 @@ namespace cpu class CpuDirectConv3d : public ICpuOperator { public: - CpuDirectConv3d(std::shared_ptr memory_manager = nullptr); + CpuDirectConv3d(std::shared_ptr memory_manager); + CpuDirectConv3d() : CpuDirectConv3d(MemoryManagerOnDemand::make_default()) + { + } ~CpuDirectConv3d(); /** Set the input, weights, biases and output tensor info. * @@ -104,4 +108,4 @@ class CpuDirectConv3d : public ICpuOperator }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */ +#endif // ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 0ea3c249df..f7af2d7fc2 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -361,6 +361,8 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.pretranspose_A(), "Matrix A already pretransposed is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.pretranspose_B(), "Matrix B already pretransposed is not supported"); // When using accumulation(in place summation), for now, the only supported DataType for output is S32. if (gemm_info.accumulate()) diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index acc620edc6..38f53cf53c 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -164,9 +164,13 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY; ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use, rhs_to_use, nullptr, dst, gemm_info)); + + // Set gemm weights info to the one returned by has_opt_impl because the user query the kernel for the format to be set. 
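+    // (When the caller requests WeightFormat::ANY, has_opt_impl reports the specific fixed weight
+    // format the selected assembly kernel expects, so the validate() call below must be made
+    // against that format rather than against ANY.)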
+ gemm_info.weight_format = expected_weight_format; } - cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); + ARM_COMPUTE_RETURN_ON_ERROR( + cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info)); return Status{}; } @@ -251,9 +255,11 @@ void CpuMatMul::configure(ITensorInfo *lhs, { _gemm_info.weight_format = WeightFormat::ANY; arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY; - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use, - &rhs_to_use, nullptr, dst, _gemm_info)); - // Set gemm weights info to the one returned by has_opt_impl + Status ret = cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use, &rhs_to_use, + nullptr, dst, _gemm_info); + ARM_COMPUTE_ERROR_THROW_ON(ret); + + // Set gemm weights info to the one returned by has_opt_impl because the user query the kernel for the format to be set. _gemm_info.weight_format = expected_weight_format; // has_opt_impl may return a non fast math kernel, even if we requested one _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format); @@ -264,6 +270,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul + ARM_COMPUTE_EXIT_ON_MSG(!_asm_glue->is_configured(), "Error in CpuGemmAssemblyDispatch configuration"); // Specify memory requirements for intermediate tensors auto asm_mem_req = _asm_glue->workspace(); // Specify memory required by gemm kernel diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp index bbffcf55a3..2d8cfceb91 100644 --- a/src/gpu/cl/kernels/ClCastKernel.cpp +++ b/src/gpu/cl/kernels/ClCastKernel.cpp @@ -54,12 +54,18 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_RETURN_ERROR_ON(src == dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, - DataType::QSYMM8_PER_CHANNEL, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, - DataType::S16, DataType::U16, DataType::U32, DataType::S32, - DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); + DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM16, DataType::QSYMM16, DataType::S16, DataType::U16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32, DataType::S64, DataType::U64); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM16, DataType::QSYMM16, DataType::S16, DataType::U16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + const DataType src_dtype = get_underlying_data_type(src->data_type()); + const DataType dst_dtype = get_underlying_data_type(dst->data_type()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src_dtype == dst_dtype, "src and dst data types must be different"); // Validate in case of configured dst if (dst->total_size() > 0) @@ -83,6 +89,9 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, { 
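+    // Quantized types are cast via their underlying storage types (e.g. QASYMM8 behaves as U8,
+    // QSYMM16 as S16), which is why validate_arguments() above compares the underlying data types
+    // and rejects source/destination pairs whose underlying types match (a plain copy).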
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const DataType src_dtype = src->data_type(); + const DataType dst_dtype = dst->data_type(); + // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) set_shape_if_empty(*dst, src->tensor_shape()); @@ -91,24 +100,24 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, auto padding_info = get_padding_info({src, dst}); // Get data sizes - const size_t src_size = data_size_from_type(src->data_type()); - const size_t dst_size = data_size_from_type(dst->data_type()); + const size_t src_size = data_size_from_type(src_dtype); + const size_t dst_size = data_size_from_type(dst_dtype); // Get number of elements to process per iterations const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); // Set build options + CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src_dtype)); + build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst_dtype)); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined - build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), - "-DIS_DATA_TYPE_FLOAT"); - build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); + build_opts.add_option_if(is_data_type_float(src_dtype) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); + build_opts.add_option_if(dst_dtype == DataType::QASYMM8 && is_data_type_quantized_per_channel(src_dtype), + "-DQSYMM8_PER_CHANNEL_TO_QASYMM8"); // Create kernel const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up"; @@ -128,7 +137,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, // Set config_id for enabling LWS tuning _config_id = kernel_name; _config_id += "_"; - _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += lower_string(string_from_data_type(src_dtype)); _config_id += "_"; _config_id += support::cpp11::to_string(src->dimension(0)); _config_id += "_"; diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp index e8df420f67..a01e31559f 100644 --- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" @@ -80,56 +81,28 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const const int input_width_x = src->tensor_shape().x(); const bool multi_access_x = (input_width_x / vec_size_x > 0); - const UniformQuantizationInfo qinfo = dst->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst->quantization_info().uniform(); const DataType output_data_type = dst->data_type(); - float scale_to_apply = qinfo.scale; - int32_t offset_to_apply = qinfo.offset; + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT"); + if (is_data_type_quantized_asymmetric(src->data_type())) { - /* - * In case of requantization of a quantized input tensor to an output tensor with another quantization - * instead of of apply dequantization and then a quantization functions, we just compute new scale and - * offset to apply. - * - * Assuming: - * - q_i as input quantized value - * - q_o as output quantized value - * - z_i as input quantization offset value - * - z_o as output quantization offset value - * - s_i as input quantization scale value - * - s_o as output quantization scale value - * - z_n as new quantization offset value - * - s_n as new quantization scale value - * - * q_o = ( q_i - z_i ) * s_i / s_o + z_o - * - * We can rewrite the formula as: - * - * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o - * - * q_o = q_i / s_n + z_n - * - * Where: - * - * s_n = s_o / s_i - * - * z_n = - z_i * s_i / s_o + z_o - * - */ - const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform(); - scale_to_apply /= qinfo_in.scale; - // In order to minimize flooring we convert the offset to a float, - // then compute the new offset in the float domain, - // finally we convert it back as int32_t - offset_to_apply -= static_cast(static_cast(qinfo_in.offset) * qinfo_in.scale / qinfo.scale); + const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform(); + + const UniformRequantizationInfo reqinfo = compute_requantization_scale_float_offset(src_qinfo, dst_qinfo); + + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(reqinfo.scale)); + build_opts.add_option("-DOFFSET=" + float_to_string_with_full_precision(reqinfo.offset)); + } + else + { + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(dst_qinfo.scale)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(dst_qinfo.offset)); } - // Create kernel - CLBuildOptions build_opts; - build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT"); - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 8f26ef003d..7636736983 100644 --- 
a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,8 @@ void ClCast::configure(const ClCompileContext &compile_context, ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); + ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, policy)); + auto k = std::make_unique(); k->configure(compile_context, src, dst, policy); _kernel = std::move(k); @@ -44,6 +46,12 @@ void ClCast::configure(const ClCompileContext &compile_context, Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) { + // This operation mode is supported by ClCastKernel, however it has an unusual + // casting behavior, which is not like casting between Int8 & UInt8. Therefore, + // we do not expose this mode in the public api + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::QSYMM8_PER_CHANNEL && + dst->data_type() == DataType::QASYMM8); + return kernels::ClCastKernel::validate(src, dst, policy); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 25d2293673..9469a8018e 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CL_CAST_H -#define ARM_COMPUTE_CL_CAST_H +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLCAST_H +#define ACL_SRC_GPU_CL_OPERATORS_CLCAST_H #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -42,20 +42,11 @@ class ClCast : public IClOperator * Valid data layouts: * - All * - * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------------------------------| - * |U8 | S8, U16, S16, U32, S32, F16, F32 | - * |U16 | U8, S8, S16, U32, S32, F16, F32 | - * |S16 | U8, S8, U16, U32, S32, F16, F32 | - * |U32 | U8, S8, U16, S16, S32, F16, F32 | - * |S32 | U8, S8, U16, S16, U32, F16, F32 | - * |F16 | U8, S8, U16, S16, U32, F32 | - * |F32 | U8, S8, U16, S16, U32, F16 | + * For data type configurations supported, please have a look at @ref CLCast * * @param[in] compile_context The compile context to be used. - * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. - * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] src The source tensor to convert. + * @param[out] dst The destinatio tensor. * @param[in] policy Conversion policy. */ void @@ -70,4 +61,4 @@ class ClCast : public IClOperator }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_CAST_H */ +#endif // ACL_SRC_GPU_CL_OPERATORS_CLCAST_H diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp index 5fa9ea47e9..f6924d703e 100644 --- a/src/runtime/MemoryManagerOnDemand.cpp +++ b/src/runtime/MemoryManagerOnDemand.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Arm Limited. + * Copyright (c) 2016-2018, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,8 +24,10 @@ #include "arm_compute/runtime/MemoryManagerOnDemand.h" #include "arm_compute/core/Error.h" +#include "arm_compute/runtime/BlobLifetimeManager.h" #include "arm_compute/runtime/ILifetimeManager.h" #include "arm_compute/runtime/IPoolManager.h" +#include "arm_compute/runtime/PoolManager.h" #include @@ -71,4 +73,13 @@ void MemoryManagerOnDemand::clear() ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!"); _pool_mgr->clear_pools(); } + +std::shared_ptr MemoryManagerOnDemand::make_default() +{ + auto lifetime_mgr = std::make_shared(); + auto pool_mgr = std::make_shared(); + auto mm = std::make_shared(lifetime_mgr, pool_mgr); + + return mm; +} } //namespace arm_compute diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index be451bcdeb..8a98437caf 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -136,6 +136,7 @@ void NEFullyConnectedLayer::prepare() if (!_impl->is_prepared) { allocate_tensors(_impl->aux_mem_req, _impl->workspace); + MemoryGroupResourceScope scope_mg(_impl->memory_group); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index d26b819864..b64bbbe3ad 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -76,8 +76,9 @@ void NEGEMM::configure(const ITensor *a, // Check if we need to reshape the matrix B only on the first run _impl->is_prepared = false; - _impl->original_b = b; - _impl->op = std::make_unique(); + _impl->memory_group.mappings().clear(); + _impl->original_b = b; + _impl->op = std::make_unique(); // Make the B matrix dynamic values. auto b_info_to_use = b->info()->clone(); diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index b5cdd864ba..a104ce02d1 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -60,7 +60,8 @@ void NEGEMMConv2d::configure( _impl->weights = weights; _impl->is_prepared = false; - _impl->op = std::make_unique(); + _impl->memory_group.mappings().clear(); + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index 03df5115f0..ffc73f0bc0 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -70,8 +70,9 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); _impl->is_prepared = false; - _impl->weights = weights; - _impl->op = std::make_unique(); + _impl->memory_group.mappings().clear(); + _impl->weights = weights; + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? 
biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 6d172cef27..1c730bd031 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -24,17 +24,20 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/DataTypeUtils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/quantization/AsymmHelpers.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" +#include + using namespace arm_compute::experimental; namespace arm_compute @@ -49,6 +52,7 @@ struct NEGEMMLowpMatrixMultiplyCore::Impl IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; + ActivationLayerInfo act_info{}; bool is_prepared{false}; }; @@ -74,8 +78,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure( } _impl->is_prepared = false; - _impl->b = b; - _impl->op = std::make_unique(); + _impl->memory_group.mappings().clear(); + _impl->b = b; + _impl->op = std::make_unique(); _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info); _impl->run_pack = {{TensorType::ACL_SRC_0, a}, @@ -84,6 +89,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure( {TensorType::ACL_DST, output}}; _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; _impl->aux_mem_req = _impl->op->workspace(); + _impl->act_info = gemm_info.activation_info(); _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack, /* allocate_now */ false); } @@ -106,6 +112,11 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, void NEGEMMLowpMatrixMultiplyCore::update_quantization_parameters() { + // Supported activations in GEMM + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + auto src = _impl->run_pack.get_const_tensor(ACL_SRC_0); auto wei = _impl->run_pack.get_const_tensor(ACL_SRC_1); auto dst = _impl->run_pack.get_tensor(ACL_DST); @@ -114,14 +125,23 @@ void NEGEMMLowpMatrixMultiplyCore::update_quantization_parameters() const QuantizationInfo wqinfo = wei->info()->quantization_info(); const QuantizationInfo oqinfo = (dst->info()->total_size() == 0) ? 
iqinfo : dst->info()->quantization_info(); - int32_t min_activation = 0; - int32_t max_activation = 0; - std::tie(min_activation, max_activation) = - quantization::get_quantized_asymmetric_output_min_max(wqinfo, ActivationLayerInfo(), wei->info()->data_type()); + PixelValue type_min{}; + PixelValue type_max{}; + const DataType data_type = src->info()->data_type(); + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get(); + int32_t max_activation = type_max.get(); + + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + if (supported_acts.find(_impl->act_info.activation()) != supported_acts.end()) + { + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(_impl->act_info, data_type, uoqinfo); + } GEMMLowpOutputStageInfo output_info; output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = oqinfo.uniform().offset; + output_info.gemmlowp_offset = uoqinfo.offset; output_info.gemmlowp_min_bound = min_activation; output_info.gemmlowp_max_bound = max_activation; output_info.is_quantized_per_channel = false; diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 31898bafc4..36ce7de262 100644 --- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,16 +34,23 @@ namespace arm_compute { struct NEMatMul::Impl { + Impl(std::shared_ptr memory_manager) : memory_group(memory_manager) + { + } + + Impl(const Impl &) = delete; + Impl &operator=(const Impl &) = delete; + const ITensor *lhs{nullptr}; const ITensor *rhs{nullptr}; ITensor *output{nullptr}; std::unique_ptr op{nullptr}; - MemoryGroup memory_group{}; + MemoryGroup memory_group; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; }; -NEMatMul::NEMatMul() : _impl(std::make_unique()) +NEMatMul::NEMatMul(std::shared_ptr memory_manager) : _impl(std::make_unique(memory_manager)) { } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index b72aff577a..e3bfc1b2c2 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -69,7 +69,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ActivationLayerInfo &act_info, bool enable_fast_math) { - _impl->is_prepared = false; + _impl->is_prepared = false; + _impl->memory_group.mappings().clear(); _impl->original_weights = weights; _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? 
biases->info() : nullptr, output->info(), diff --git a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp index ec0557ff4e..adda460c96 100644 --- a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp @@ -79,11 +79,30 @@ void CpuGemmAssemblyDispatch::configure( Status CpuGemmAssemblyDispatch::validate( const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info) { - if (gemm_info.reinterpret_input_as_3d() != false || gemm_info.depth_output_gemm3d() != false || - gemm_info.reshape_b_only_on_first_run() != true) + if (gemm_info.reinterpret_input_as_3d() || gemm_info.depth_output_gemm3d() || + !gemm_info.reshape_b_only_on_first_run()) { - return Status(ErrorCode::RUNTIME_ERROR); + return Status(ErrorCode::RUNTIME_ERROR, "unsupported arguments in gemm_info"); } + bool a_data_type_ok = a->data_type() == DataType::F32 || a->data_type() == DataType::F16; + bool b_data_type_ok = b->data_type() == DataType::F32 || b->data_type() == DataType::F16; + bool c_data_type_ok = c == nullptr; + bool d_data_type_ok = d->data_type() == DataType::F32 || d->data_type() == DataType::F16; + bool bf16_ok = ((a->data_type() == DataType::BFLOAT16 && b->data_type() == DataType::BFLOAT16) || + b->data_type() == DataType::BFLOAT16) && + (d->data_type() == DataType::BFLOAT16 || d->data_type() == DataType::F32); + + bool fixed_format_dtype_ok = + (!gemm_info.fixed_format() || + (a->data_type() == DataType::F32 && b->data_type() == DataType::F32 && d->data_type() == DataType::F32) || + (a->data_type() == DataType::F16 && b->data_type() == DataType::F16 && d->data_type() == DataType::F16) || + bf16_ok); + + if (!((a_data_type_ok && b_data_type_ok && c_data_type_ok && d_data_type_ok && fixed_format_dtype_ok) || bf16_ok)) + { + return Status(ErrorCode::RUNTIME_ERROR, "datatype is not supported"); + } + return cpu::CpuGemmAssemblyDispatch::validate(a, b, c, d, init_assembly_metadata(gemm_info)); } diff --git a/src/runtime/experimental/operators/CpuDequantize.cpp b/src/runtime/experimental/operators/CpuDequantize.cpp new file mode 100644 index 0000000000..08592d54d3 --- /dev/null +++ b/src/runtime/experimental/operators/CpuDequantize.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/operators/CpuDequantize.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/experimental/operators/CpuDequantize.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +struct CpuDequantize::Impl +{ + std::unique_ptr op{nullptr}; +}; + +CpuDequantize::CpuDequantize() : impl_(std::make_unique()) +{ +} +CpuDequantize::~CpuDequantize() = default; + +void CpuDequantize::configure(const ITensorInfo *input, ITensorInfo *output) +{ + impl_->op = std::make_unique(); + impl_->op->configure(input, output); +} + +Status CpuDequantize::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return cpu::CpuDequantize::validate(input, output); +} + +void CpuDequantize::run(ITensorPack &tensors) +{ + impl_->op->run(tensors); +} + +} // namespace op +} // namespace experimental +} // namespace arm_compute diff --git a/src/runtime/experimental/operators/CpuGEMMLowp.cpp b/src/runtime/experimental/operators/CpuGEMMLowp.cpp new file mode 100644 index 0000000000..57391fbca9 --- /dev/null +++ b/src/runtime/experimental/operators/CpuGEMMLowp.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h" + +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +struct CpuGEMMLowp::Impl +{ + std::unique_ptr op{nullptr}; + bool is_prepared{false}; +}; + +CpuGEMMLowp::CpuGEMMLowp() : _impl(std::make_unique()) +{ + _impl->op = std::make_unique(); +} +CpuGEMMLowp::~CpuGEMMLowp() = default; + +experimental::MemoryRequirements CpuGEMMLowp::workspace() const +{ + return _impl->op->workspace(); +} + +void CpuGEMMLowp::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + + // Make the B matrix dynamic values. + auto b_info_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) + { + b_info_to_use->set_are_values_constant(false); + } + + _impl->is_prepared = false; + _impl->op->configure(a, b_info_to_use.get(), (c != nullptr ? 
c : nullptr), output, gemm_info); +} + +Status CpuGEMMLowp::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + // Make the B matrix dynamic values. + auto b_info_to_use = b->clone(); + if (!gemm_info.reshape_b_only_on_first_run()) + { + b_info_to_use->set_are_values_constant(false); + } + + return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info); +} + +void CpuGEMMLowp::run(ITensorPack &tensors) +{ + prepare(tensors); + _impl->op->run(tensors); +} + +void CpuGEMMLowp::prepare(ITensorPack &tensors) +{ + if (!_impl->is_prepared) + { + _impl->op->prepare(tensors); + + auto aux_mem_req = _impl->op->workspace(); + + auto has_reshape = + std::find_if(aux_mem_req.begin(), aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + + if (has_reshape != std::end(aux_mem_req)) + { + auto b = tensors.get_tensor(TensorType::ACL_SRC_1); + b->mark_as_unused(); + } + + _impl->is_prepared = true; + } +} +} // namespace op +} // namespace experimental +} // namespace arm_compute diff --git a/src/runtime/experimental/operators/CpuGemmConv2d.cpp b/src/runtime/experimental/operators/CpuGemmConv2d.cpp index 7253f6e0f1..3174dd30a8 100644 --- a/src/runtime/experimental/operators/CpuGemmConv2d.cpp +++ b/src/runtime/experimental/operators/CpuGemmConv2d.cpp @@ -90,6 +90,11 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo weights_info, dilation, act_info, enable_fast_math); } +void CpuGemmConv2d::update_quantization_parameters(ITensorPack &tensors) +{ + _impl->op->update_quantization_parameters(tensors); +} + void CpuGemmConv2d::run(ITensorPack &tensors) { prepare(tensors); diff --git a/src/runtime/experimental/operators/CpuQuantize.cpp b/src/runtime/experimental/operators/CpuQuantize.cpp new file mode 100644 index 0000000000..59a65d3611 --- /dev/null +++ b/src/runtime/experimental/operators/CpuQuantize.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/operators/CpuQuantize.h" + +#include "arm_compute/runtime/experimental/operators/CpuQuantize.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +struct CpuQuantize::Impl +{ + std::unique_ptr op{nullptr}; +}; + +CpuQuantize::CpuQuantize() : impl_(std::make_unique()) +{ +} +CpuQuantize::~CpuQuantize() = default; + +Status CpuQuantize::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return cpu::CpuQuantize::validate(input, output); +} + +void CpuQuantize::configure(const ITensorInfo *input, ITensorInfo *output) +{ + impl_->op = std::make_unique(); + impl_->op->configure(input, output); +} + +void CpuQuantize::run(ITensorPack &pack) +{ + impl_->op->run(pack); +} +} // namespace op +} // namespace experimental +} // namespace arm_compute diff --git a/support/Bfloat16.h b/support/Bfloat16.h index 7c5ef78848..d23cee5fe7 100644 --- a/support/Bfloat16.h +++ b/support/Bfloat16.h @@ -102,7 +102,7 @@ class bfloat16 final * * @param[in] v Floating-point value */ - bfloat16(float v) : value(float_to_bf16(v)) + explicit bfloat16(float v) : value(float_to_bf16(v)) { } /** Constructor diff --git a/support/SaturateCast.h b/support/SaturateCast.h index 7af9f983ed..64a2157afe 100644 --- a/support/SaturateCast.h +++ b/support/SaturateCast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H -#define ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H +#ifndef ACL_SUPPORT_SATURATECAST_H +#define ACL_SUPPORT_SATURATECAST_H #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/misc/Utility.h" @@ -190,6 +190,18 @@ inline T saturate_cast(U v) return saturate_cast(vi); } +// float -> int +template::value && + traits::is_floating_point::value, + int >::type = 0 > +inline T saturate_static_cast(U v) +{ + int32_t vi = static_cast(v); + return saturate_cast(vi); +} + // int -> float template #ifndef DOXYGEN_SKIP_THIS @@ -3061,4 +3063,4 @@ enum #endif /* DOXYGEN_SKIP_THIS */ -#endif /* ARM_COMPUTE_TEST_HWC_NAMES */ +#endif // ACL_TESTS_FRAMEWORK_INSTRUMENTS_HWC_NAMES_HPP diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp index 2f943e84d8..46238b431a 100644 --- a/tests/validation/CL/Cast.cpp +++ b/tests/validation/CL/Cast.cpp @@ -26,8 +26,8 @@ #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/runtime/CL/functions/CLCast.h" #include "tests/CL/CLAccessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ConvertPolicyDataset.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/datasets/ShapeDatasets.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" @@ -35,12 +35,17 @@ #include "tests/validation/Validation.h" #include "tests/validation/fixtures/CastFixture.h" +#include +#include + namespace arm_compute { namespace test { namespace validation { + +using framework::dataset::make; namespace { // Tolerance @@ -49,100 +54,176 @@ constexpr AbsoluteTolerance zero_tolerance(0); /** Input data sets **/ // QASYMM8 -const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32)); -const auto CastQSYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QSYMM8), framework::dataset::make("DataType", 
DataType::F32)); +const auto CastQASYMM8toF32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F32)); +const auto CastQSYMM8toF32Dataset = combine(make("DataType", DataType::QSYMM8), make("DataType", DataType::F32)); + +#define U8Types DataType::U8, DataType::QASYMM8 +#define S8Types DataType::S8, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM8_SIGNED +#define S8Types_wo_q8_pc DataType::S8, DataType::QSYMM8, DataType::QASYMM8_SIGNED +#define U16Types DataType::QASYMM16, DataType::U16 +#define S16Types DataType::S16, DataType::QSYMM16 // U8 -const auto CastU8toS8Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S8)); -const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16)); -const auto CastU8toS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16)); -const auto CastU8toU32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U32)); -const auto CastU8toS32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32)); -const auto CastU8toF16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F16)); -const auto CastU8toF32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F32)); +const auto CastU8toS8Dataset = combine(make("DataType", {U8Types}), make("DataType", {S8Types})); +const auto CastU8toU16Dataset = combine(make("DataType", {U8Types}), make("DataType", {U16Types})); +const auto CastU8toS16Dataset = combine(make("DataType", {U8Types}), make("DataType", {S16Types})); +const auto CastU8toU32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::U32)); +const auto CastU8toS32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::S32)); +const auto CastU8toF16Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::F16)); +const auto CastU8toF32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::F32)); // S8 -const auto CastS8toU8Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U8)); -const auto CastS8toU16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U16)); -const auto CastS8toS16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S16)); -const auto CastS8toU32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U32)); -const auto CastS8toS32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S32)); -const auto CastS8toF16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F16)); -const auto CastS8toF32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F32)); +const auto CastS8toU8Dataset = combine(make("DataType", {S8Types_wo_q8_pc}), make("DataType", {U8Types})); +const auto CastQSYMM8_PER_CHANNELtoU8Dataset = combine(make("DataType", 
DataType::QSYMM8_PER_CHANNEL), make("DataType", DataType::U8)); +const auto CastS8toU16Dataset = combine(make("DataType", {S8Types}), make("DataType", {U16Types})); +const auto CastS8toS16Dataset = combine(make("DataType", {S8Types}), make("DataType", {S16Types})); +const auto CastS8toU32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::U32)); +const auto CastS8toS32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::S32)); +const auto CastS8toF16Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::F16)); +const auto CastS8toF32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::F32)); // U16 -const auto CastU16toU8Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8)); -const auto CastU16toS8Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S8)); -const auto CastU16toS16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S16)); -const auto CastU16toU32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32)); -const auto CastU16toS32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S32)); -const auto CastU16toF16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F16)); -const auto CastU16toF32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F32)); +const auto CastU16toU8Dataset = combine(make("DataType", {U16Types}), make("DataType", {U8Types})); +const auto CastU16toS8Dataset = combine(make("DataType", {U16Types}), make("DataType", {S8Types})); +const auto CastU16toS16Dataset = combine(make("DataType", {U16Types}), make("DataType", {S16Types})); +const auto CastU16toU32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::U32)); +const auto CastU16toS32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::S32)); +const auto CastU16toF16Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::F16)); +const auto CastU16toF32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::F32)); // S16 -const auto CastS16toU8Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8)); -const auto CastS16toS8Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S8)); -const auto CastS16toU16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U16)); -const auto CastS16toU32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U32)); -const auto CastS16toS32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32)); -const auto CastS16toF16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F16)); -const auto CastS16toF32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F32)); +const auto CastS16toU8Dataset = 
combine(make("DataType", {S16Types}), make("DataType", {U8Types})); +const auto CastS16toS8Dataset = combine(make("DataType", {S16Types}), make("DataType", {S8Types})); +const auto CastS16toU16Dataset = combine(make("DataType", {S16Types}), make("DataType", {U16Types})); +const auto CastS16toU32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::U32)); +const auto CastS16toS32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::S32)); +const auto CastS16toF16Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::F16)); +const auto CastS16toF32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::F32)); // U32 -const auto CastU32toU8Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U8)); -const auto CastU32toS8Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S8)); -const auto CastU32toU16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U16)); -const auto CastU32toS16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S16)); -const auto CastU32toS32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S32)); -const auto CastU32toF16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F16)); -const auto CastU32toF32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F32)); +const auto CastU32toU8Dataset = combine(make("DataType", DataType::U32), make("DataType", {U8Types})); +const auto CastU32toS8Dataset = combine(make("DataType", DataType::U32), make("DataType", {S8Types})); +const auto CastU32toU16Dataset = combine(make("DataType", DataType::U32), make("DataType", {U16Types})); +const auto CastU32toS16Dataset = combine(make("DataType", DataType::U32), make("DataType", {S16Types})); +const auto CastU32toS32Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::S32)); +const auto CastU32toF16Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::F16)); +const auto CastU32toF32Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::F32)); // S32 -const auto CastS32toU8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8)); -const auto CastS32toS8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S8)); -const auto CastS32toU16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U16)); -const auto CastS32toS16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S16)); -const auto CastS32toU32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U32)); -const auto CastS32toF16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16)); -const auto CastS32toF32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32)); +const auto 
CastS32toU8Dataset = combine(make("DataType", DataType::S32), make("DataType", {U8Types})); +const auto CastS32toS8Dataset = combine(make("DataType", DataType::S32), make("DataType", {S8Types})); +const auto CastS32toU16Dataset = combine(make("DataType", DataType::S32), make("DataType", {U16Types})); +const auto CastS32toS16Dataset = combine(make("DataType", DataType::S32), make("DataType", {S16Types})); +const auto CastS32toU32Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::U32)); +const auto CastS32toF16Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::F16)); +const auto CastS32toF32Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::F32)); // F16 -const auto CastF16toU8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8)); -const auto CastF16toS8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S8)); -const auto CastF16toU16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U16)); -const auto CastF16toS16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S16)); -const auto CastF16toU32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U32)); -const auto CastF16toS32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32)); -const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32)); +const auto CastF16toU8Dataset = combine(make("DataType", DataType::F16), make("DataType", {U8Types})); +const auto CastF16toS8Dataset = combine(make("DataType", DataType::F16), make("DataType", {S8Types})); +const auto CastF16toU16Dataset = combine(make("DataType", DataType::F16), make("DataType", {U16Types})); +const auto CastF16toS16Dataset = combine(make("DataType", DataType::F16), make("DataType", {S16Types})); +const auto CastF16toU32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::U32)); +const auto CastF16toS32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::S32)); +const auto CastF16toF32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::F32)); // F32 -const auto CastF32toU8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8)); -const auto CastF32toS8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S8)); -const auto CastF32toU16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U16)); -const auto CastF32toS16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S16)); -const auto CastF32toU32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U32)); -const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32)); -const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), 
framework::dataset::make("DataType", DataType::F16)); +const auto CastF32toU8Dataset = combine(make("DataType", DataType::F32), make("DataType", {U8Types})); +const auto CastF32toS8Dataset = combine(make("DataType", DataType::F32), make("DataType", {S8Types})); +const auto CastF32toU16Dataset = combine(make("DataType", DataType::F32), make("DataType", {U16Types})); +const auto CastF32toS16Dataset = combine(make("DataType", DataType::F32), make("DataType", {S16Types})); +const auto CastF32toU32Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::U32)); +const auto CastF32toS32Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::S32)); +const auto CastF32toF16Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::F16)); // U64 -const auto CastU64toU8Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U8)); -const auto CastU64toS8Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S8)); -const auto CastU64toU16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U16)); -const auto CastU64toS16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S16)); -const auto CastU64toU32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U32)); -const auto CastU64toS32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S32)); -const auto CastU64toF16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F16)); -const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32)); +const auto CastU64toU8Dataset = combine(make("DataType", DataType::U64), make("DataType", {U8Types})); +const auto CastU64toS8Dataset = combine(make("DataType", DataType::U64), make("DataType", {S8Types})); +const auto CastU64toU16Dataset = combine(make("DataType", DataType::U64), make("DataType", {U16Types})); +const auto CastU64toS16Dataset = combine(make("DataType", DataType::U64), make("DataType", {S16Types})); +const auto CastU64toU32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::U32)); +const auto CastU64toS32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::S32)); +const auto CastU64toF16Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F16)); +const auto CastU64toF32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F32)); // S64 -const auto CastS64toU8Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U8)); -const auto CastS64toS8Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S8)); -const auto CastS64toU16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U16)); -const auto CastS64toS16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S16)); -const auto CastS64toU32Dataset = combine(framework::dataset::make("DataType", DataType::S64), 
framework::dataset::make("DataType", DataType::U32));
-const auto CastS64toS32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S32));
-const auto CastS64toF16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F16));
-const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
+const auto CastS64toU8Dataset = combine(make("DataType", DataType::S64), make("DataType", {U8Types}));
+const auto CastS64toS8Dataset = combine(make("DataType", DataType::S64), make("DataType", {S8Types}));
+const auto CastS64toU16Dataset = combine(make("DataType", DataType::S64), make("DataType", {U16Types}));
+const auto CastS64toS16Dataset = combine(make("DataType", DataType::S64), make("DataType", {S16Types}));
+const auto CastS64toU32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::U32));
+const auto CastS64toS32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::S32));
+const auto CastS64toF16Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F16));
+const auto CastS64toF32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F32));
+
+void validate_data_types(DataType input_dtype, DataType output_dtype)
+{
+    const auto input  = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype);
+    auto       output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype);
+
+    const Status status   = (CLCast::validate(&input, &output, ConvertPolicy::SATURATE));
+    const bool   is_valid = static_cast<bool>(status);
+
+    static std::map<DataType, std::vector<DataType>> supported_dtypes;
+
+    supported_dtypes[DataType::U8] = {
+        S8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::S8] = {
+        U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::U16] = {
+        DataType::U8, DataType::S8, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32,
+        DataType::QSYMM8, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM8_SIGNED, DataType::QSYMM16};
+
+    supported_dtypes[DataType::S16] = {
+        S8Types, U8Types, U16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::U32] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::S32, DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::S32] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::U64] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32,
+        DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::S64] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32,
+        DataType::F16, DataType::F32};
+
+    supported_dtypes[DataType::F16] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F32};
+
+    supported_dtypes[DataType::F32] = {
+        S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16};
+
+    supported_dtypes[DataType::QSYMM8]         = supported_dtypes[DataType::S8];
+    supported_dtypes[DataType::QASYMM8_SIGNED] = supported_dtypes[DataType::S8];
+    supported_dtypes[DataType::QSYMM8_PER_CHANNEL] = {
+        U16Types, S16Types, DataType::U8, DataType::U32, DataType::S32, DataType::F16, DataType::F32
+    };
+
+    supported_dtypes[DataType::QASYMM8] =
supported_dtypes[DataType::U8]; + + supported_dtypes[DataType::QSYMM16] = supported_dtypes[DataType::S16]; + supported_dtypes[DataType::QASYMM16] = supported_dtypes[DataType::U16]; + + bool expected = false; + if(supported_dtypes.find(input_dtype) != supported_dtypes.end()) + { + const auto supports = supported_dtypes[input_dtype]; + expected = (std::find(supports.begin(), supports.end(), output_dtype) != supports.end()); + } + + ARM_COMPUTE_EXPECT_EQUAL(is_valid, expected, framework::LogLevel::ERRORS); + + if(is_valid != expected) + { + std::cout << status.error_description() << std::endl; + } +} } // namespace TEST_SUITE(CL) @@ -164,7 +245,17 @@ using CLCastToF16Fixture = CastValidationFixture using CLCastToF32Fixture = CastValidationFixture; -#define CAST_SUITE(NAME, idt, odt, type, dataset, tolerance) \ +DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("OutputDataType")), + input_dtype, output_dtype) +{ + validate_data_types(input_dtype, output_dtype); +} + + +#define CAST_SUITE(NAME, type, dataset, tolerance) \ TEST_SUITE(NAME) \ FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \ datasets::ConvertPolicies())) \ @@ -173,103 +264,97 @@ using CLCastToF32Fixture = CastValidationFixture, CastQASYMM8toF32Dataset, zero_tolerance) -// QSYMM8 -CAST_SUITE(QSYMM8_to_F32, DataType::QSYMM8, DataType::F32, CLCastToF32Fixture, CastQSYMM8toF32Dataset, zero_tolerance) - - // U8 -CAST_SUITE(U8_to_S8, DataType::U8, DataType::S8, CLCastToS8Fixture, CastU8toS8Dataset, zero_tolerance) -CAST_SUITE(U8_to_U16, DataType::U8, DataType::U16, CLCastToU16Fixture, CastU8toU16Dataset, zero_tolerance) -CAST_SUITE(U8_to_S16, DataType::U8, DataType::S16, CLCastToS16Fixture, CastU8toS16Dataset, zero_tolerance) -CAST_SUITE(U8_to_U32, DataType::U8, DataType::U32, CLCastToU32Fixture, CastU8toU32Dataset, zero_tolerance) -CAST_SUITE(U8_to_S32, DataType::U8, DataType::S32, CLCastToS32Fixture, CastU8toS32Dataset, zero_tolerance) -CAST_SUITE(U8_to_F16, DataType::U8, DataType::F16, CLCastToF16Fixture, CastU8toF16Dataset, zero_tolerance) -CAST_SUITE(U8_to_F32, DataType::U8, DataType::F32, CLCastToF32Fixture, CastU8toF32Dataset, zero_tolerance) +CAST_SUITE(U8_to_S8, CLCastToS8Fixture, CastU8toS8Dataset, zero_tolerance) +CAST_SUITE(U8_to_U16, CLCastToU16Fixture, CastU8toU16Dataset, zero_tolerance) +CAST_SUITE(U8_to_S16, CLCastToS16Fixture, CastU8toS16Dataset, zero_tolerance) +CAST_SUITE(U8_to_U32, CLCastToU32Fixture, CastU8toU32Dataset, zero_tolerance) +CAST_SUITE(U8_to_S32, CLCastToS32Fixture, CastU8toS32Dataset, zero_tolerance) +CAST_SUITE(U8_to_F16, CLCastToF16Fixture, CastU8toF16Dataset, zero_tolerance) +CAST_SUITE(U8_to_F32, CLCastToF32Fixture, CastU8toF32Dataset, zero_tolerance) // S8 -CAST_SUITE(S8_to_U8, DataType::S8, DataType::U8, CLCastToU8Fixture, CastS8toU8Dataset, zero_tolerance) -CAST_SUITE(S8_to_U16, DataType::S8, DataType::U16, CLCastToU16Fixture, CastS8toU16Dataset, zero_tolerance) -CAST_SUITE(S8_to_S16, DataType::S8, DataType::S16, CLCastToS16Fixture, CastS8toS16Dataset, zero_tolerance) -CAST_SUITE(S8_to_U32, DataType::S8, DataType::U32, CLCastToU32Fixture, CastS8toU32Dataset, zero_tolerance) -CAST_SUITE(S8_to_S32, DataType::S8, DataType::S32, CLCastToS32Fixture, CastS8toS32Dataset, zero_tolerance) -CAST_SUITE(S8_to_F16, DataType::S8, DataType::F16, CLCastToF16Fixture, CastS8toF16Dataset, zero_tolerance) -CAST_SUITE(S8_to_F32, DataType::S8, 
DataType::F32, CLCastToF32Fixture, CastS8toF32Dataset, zero_tolerance) +CAST_SUITE(S8_to_U8, CLCastToU8Fixture, CastS8toU8Dataset, zero_tolerance) +CAST_SUITE(S8_to_U16, CLCastToU16Fixture, CastS8toU16Dataset, zero_tolerance) +CAST_SUITE(S8_to_S16, CLCastToS16Fixture, CastS8toS16Dataset, zero_tolerance) +CAST_SUITE(S8_to_U32, CLCastToU32Fixture, CastS8toU32Dataset, zero_tolerance) +CAST_SUITE(S8_to_S32, CLCastToS32Fixture, CastS8toS32Dataset, zero_tolerance) +CAST_SUITE(S8_to_F16, CLCastToF16Fixture, CastS8toF16Dataset, zero_tolerance) +CAST_SUITE(S8_to_F32, CLCastToF32Fixture, CastS8toF32Dataset, zero_tolerance) // U16 -CAST_SUITE(U16_to_U8, DataType::U16, DataType::U8, CLCastToU8Fixture, CastU16toU8Dataset, zero_tolerance) -CAST_SUITE(U16_to_S8, DataType::U16, DataType::S8, CLCastToS8Fixture, CastU16toS8Dataset, zero_tolerance) -CAST_SUITE(U16_to_S16, DataType::U16, DataType::S16, CLCastToS16Fixture, CastU16toS16Dataset, zero_tolerance) -CAST_SUITE(U16_to_U32, DataType::U16, DataType::U32, CLCastToU32Fixture, CastU16toU32Dataset, zero_tolerance) -CAST_SUITE(U16_to_S32, DataType::U16, DataType::S32, CLCastToS32Fixture, CastU16toS32Dataset, zero_tolerance) -CAST_SUITE(U16_to_F16, DataType::U16, DataType::F16, CLCastToF16Fixture, CastU16toF16Dataset, zero_tolerance) -CAST_SUITE(U16_to_F32, DataType::U16, DataType::F32, CLCastToF32Fixture, CastU16toF32Dataset, zero_tolerance) +CAST_SUITE(U16_to_U8, CLCastToU8Fixture, CastU16toU8Dataset, zero_tolerance) +CAST_SUITE(U16_to_S8, CLCastToS8Fixture, CastU16toS8Dataset, zero_tolerance) +CAST_SUITE(U16_to_S16, CLCastToS16Fixture, CastU16toS16Dataset, zero_tolerance) +CAST_SUITE(U16_to_U32, CLCastToU32Fixture, CastU16toU32Dataset, zero_tolerance) +CAST_SUITE(U16_to_S32, CLCastToS32Fixture, CastU16toS32Dataset, zero_tolerance) +CAST_SUITE(U16_to_F16, CLCastToF16Fixture, CastU16toF16Dataset, zero_tolerance) +CAST_SUITE(U16_to_F32, CLCastToF32Fixture, CastU16toF32Dataset, zero_tolerance) // S16 -CAST_SUITE(S16_to_U8, DataType::S16, DataType::U8, CLCastToU8Fixture, CastS16toU8Dataset, zero_tolerance) -CAST_SUITE(S16_to_S8, DataType::S16, DataType::S8, CLCastToS8Fixture, CastS16toS8Dataset, zero_tolerance) -CAST_SUITE(S16_to_U16, DataType::S16, DataType::U16, CLCastToU16Fixture, CastS16toU16Dataset, zero_tolerance) -CAST_SUITE(S16_to_U32, DataType::S16, DataType::U32, CLCastToU32Fixture, CastS16toU32Dataset, zero_tolerance) -CAST_SUITE(S16_to_S32, DataType::S16, DataType::S32, CLCastToS32Fixture, CastS16toS32Dataset, zero_tolerance) -CAST_SUITE(S16_to_F16, DataType::S16, DataType::F16, CLCastToF16Fixture, CastS16toF16Dataset, zero_tolerance) -CAST_SUITE(S16_to_F32, DataType::S16, DataType::F32, CLCastToF32Fixture, CastS16toF32Dataset, zero_tolerance) +CAST_SUITE(S16_to_U8, CLCastToU8Fixture, CastS16toU8Dataset, zero_tolerance) +CAST_SUITE(S16_to_S8, CLCastToS8Fixture, CastS16toS8Dataset, zero_tolerance) +CAST_SUITE(S16_to_U16, CLCastToU16Fixture, CastS16toU16Dataset, zero_tolerance) +CAST_SUITE(S16_to_U32, CLCastToU32Fixture, CastS16toU32Dataset, zero_tolerance) +CAST_SUITE(S16_to_S32, CLCastToS32Fixture, CastS16toS32Dataset, zero_tolerance) +CAST_SUITE(S16_to_F16, CLCastToF16Fixture, CastS16toF16Dataset, zero_tolerance) +CAST_SUITE(S16_to_F32, CLCastToF32Fixture, CastS16toF32Dataset, zero_tolerance) // U32 -CAST_SUITE(U32_to_U8, DataType::U32, DataType::U8, CLCastToU8Fixture, CastU32toU8Dataset, zero_tolerance) -CAST_SUITE(U32_to_S8, DataType::U32, DataType::S8, CLCastToS8Fixture, CastU32toS8Dataset, zero_tolerance) -CAST_SUITE(U32_to_U16, DataType::U32, 
DataType::U16, CLCastToU16Fixture, CastU32toU16Dataset, zero_tolerance) -CAST_SUITE(U32_to_S16, DataType::U32, DataType::S16, CLCastToS16Fixture, CastU32toS16Dataset, zero_tolerance) -CAST_SUITE(U32_to_S32, DataType::U32, DataType::S32, CLCastToS32Fixture, CastU32toS32Dataset, zero_tolerance) -CAST_SUITE(U32_to_F16, DataType::U32, DataType::F16, CLCastToF16Fixture, CastU32toF16Dataset, zero_tolerance) -CAST_SUITE(U32_to_F32, DataType::U32, DataType::F32, CLCastToF32Fixture, CastU32toF32Dataset, zero_tolerance) +CAST_SUITE(U32_to_U8, CLCastToU8Fixture, CastU32toU8Dataset, zero_tolerance) +CAST_SUITE(U32_to_S8, CLCastToS8Fixture, CastU32toS8Dataset, zero_tolerance) +CAST_SUITE(U32_to_U16, CLCastToU16Fixture, CastU32toU16Dataset, zero_tolerance) +CAST_SUITE(U32_to_S16, CLCastToS16Fixture, CastU32toS16Dataset, zero_tolerance) +CAST_SUITE(U32_to_S32, CLCastToS32Fixture, CastU32toS32Dataset, zero_tolerance) +CAST_SUITE(U32_to_F16, CLCastToF16Fixture, CastU32toF16Dataset, zero_tolerance) +CAST_SUITE(U32_to_F32, CLCastToF32Fixture, CastU32toF32Dataset, zero_tolerance) // S32 -CAST_SUITE(S32_to_U8, DataType::S32, DataType::U8, CLCastToU8Fixture, CastS32toU8Dataset, zero_tolerance) -CAST_SUITE(S32_to_S8, DataType::S32, DataType::S8, CLCastToS8Fixture, CastS32toS8Dataset, zero_tolerance) -CAST_SUITE(S32_to_U16, DataType::S32, DataType::U16, CLCastToU16Fixture, CastS32toU16Dataset, zero_tolerance) -CAST_SUITE(S32_to_S16, DataType::S32, DataType::S16, CLCastToS16Fixture, CastS32toS16Dataset, zero_tolerance) -CAST_SUITE(S32_to_U32, DataType::S32, DataType::U32, CLCastToU32Fixture, CastS32toU32Dataset, zero_tolerance) -CAST_SUITE(S32_to_F16, DataType::S32, DataType::F16, CLCastToF16Fixture, CastS32toF16Dataset, zero_tolerance) -CAST_SUITE(S32_to_F32, DataType::S32, DataType::F32, CLCastToF32Fixture, CastS32toF32Dataset, zero_tolerance) +CAST_SUITE(S32_to_U8, CLCastToU8Fixture, CastS32toU8Dataset, zero_tolerance) +CAST_SUITE(S32_to_S8, CLCastToS8Fixture, CastS32toS8Dataset, zero_tolerance) +CAST_SUITE(S32_to_U16, CLCastToU16Fixture, CastS32toU16Dataset, zero_tolerance) +CAST_SUITE(S32_to_S16, CLCastToS16Fixture, CastS32toS16Dataset, zero_tolerance) +CAST_SUITE(S32_to_U32, CLCastToU32Fixture, CastS32toU32Dataset, zero_tolerance) +CAST_SUITE(S32_to_F16, CLCastToF16Fixture, CastS32toF16Dataset, zero_tolerance) +CAST_SUITE(S32_to_F32, CLCastToF32Fixture, CastS32toF32Dataset, zero_tolerance) // F16 -CAST_SUITE(F16_to_U8, DataType::F16, DataType::U8, CLCastToU8Fixture, CastF16toU8Dataset, one_tolerance) -CAST_SUITE(F16_to_S8, DataType::F16, DataType::S8, CLCastToS8Fixture, CastF16toS8Dataset, one_tolerance) -CAST_SUITE(F16_to_U16, DataType::F16, DataType::U16, CLCastToU16Fixture, CastF16toU16Dataset, one_tolerance) -CAST_SUITE(F16_to_S16, DataType::F16, DataType::S16, CLCastToS16Fixture, CastF16toS16Dataset, one_tolerance) -CAST_SUITE(F16_to_U32, DataType::F16, DataType::U32, CLCastToU32Fixture, CastF16toU32Dataset, one_tolerance) -CAST_SUITE(F16_to_S32, DataType::F16, DataType::S32, CLCastToS32Fixture, CastF16toS32Dataset, one_tolerance) -CAST_SUITE(F16_to_F32, DataType::F16, DataType::F32, CLCastToF32Fixture, CastF16toF32Dataset, zero_tolerance) +CAST_SUITE(F16_to_U8, CLCastToU8Fixture, CastF16toU8Dataset, one_tolerance) +CAST_SUITE(F16_to_S8, CLCastToS8Fixture, CastF16toS8Dataset, one_tolerance) +CAST_SUITE(F16_to_U16, CLCastToU16Fixture, CastF16toU16Dataset, one_tolerance) +CAST_SUITE(F16_to_S16, CLCastToS16Fixture, CastF16toS16Dataset, one_tolerance) +CAST_SUITE(F16_to_U32, CLCastToU32Fixture, 
CastF16toU32Dataset, one_tolerance) +CAST_SUITE(F16_to_S32, CLCastToS32Fixture, CastF16toS32Dataset, one_tolerance) +CAST_SUITE(F16_to_F32, CLCastToF32Fixture, CastF16toF32Dataset, zero_tolerance) // F32 -CAST_SUITE(F32_to_U8, DataType::F32, DataType::U8, CLCastToU8Fixture, CastF32toU8Dataset, one_tolerance) -CAST_SUITE(F32_to_S8, DataType::F32, DataType::S8, CLCastToS8Fixture, CastF32toS8Dataset, one_tolerance) -CAST_SUITE(F32_to_U16, DataType::F32, DataType::U16, CLCastToU16Fixture, CastF32toU16Dataset, one_tolerance) -CAST_SUITE(F32_to_S16, DataType::F32, DataType::S16, CLCastToS16Fixture, CastF32toS16Dataset, one_tolerance) -CAST_SUITE(F32_to_U32, DataType::F32, DataType::U32, CLCastToU32Fixture, CastF32toU32Dataset, one_tolerance) -CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, CLCastToS32Fixture, CastF32toS32Dataset, one_tolerance) -CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, CLCastToF16Fixture, CastF32toF16Dataset, zero_tolerance) +CAST_SUITE(F32_to_U8, CLCastToU8Fixture, CastF32toU8Dataset, one_tolerance) +CAST_SUITE(F32_to_S8, CLCastToS8Fixture, CastF32toS8Dataset, one_tolerance) +CAST_SUITE(F32_to_U16, CLCastToU16Fixture, CastF32toU16Dataset, one_tolerance) +CAST_SUITE(F32_to_S16, CLCastToS16Fixture, CastF32toS16Dataset, one_tolerance) +CAST_SUITE(F32_to_U32, CLCastToU32Fixture, CastF32toU32Dataset, one_tolerance) +CAST_SUITE(F32_to_S32, CLCastToS32Fixture, CastF32toS32Dataset, one_tolerance) +CAST_SUITE(F32_to_F16, CLCastToF16Fixture, CastF32toF16Dataset, zero_tolerance) // S64 -CAST_SUITE(S64_to_U8, DataType::S64, DataType::U8, CLCastToU8Fixture, CastS64toU8Dataset, one_tolerance) -CAST_SUITE(S64_to_S8, DataType::S64, DataType::S8, CLCastToS8Fixture, CastS64toS8Dataset, one_tolerance) -CAST_SUITE(S64_to_U16, DataType::S64, DataType::U16, CLCastToU16Fixture, CastS64toU16Dataset, one_tolerance) -CAST_SUITE(S64_to_S16, DataType::S64, DataType::S16, CLCastToS16Fixture, CastS64toS16Dataset, one_tolerance) -CAST_SUITE(S64_to_U32, DataType::S64, DataType::U32, CLCastToU32Fixture, CastS64toU32Dataset, one_tolerance) -CAST_SUITE(S64_to_S32, DataType::S64, DataType::S32, CLCastToS32Fixture, CastS64toS32Dataset, one_tolerance) -CAST_SUITE(S64_to_F16, DataType::S64, DataType::F16, CLCastToF16Fixture, CastS64toF16Dataset, zero_tolerance) -CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, CLCastToF32Fixture, CastS64toF32Dataset, zero_tolerance) +CAST_SUITE(S64_to_U8, CLCastToU8Fixture, CastS64toU8Dataset, one_tolerance) +CAST_SUITE(S64_to_S8, CLCastToS8Fixture, CastS64toS8Dataset, one_tolerance) +CAST_SUITE(S64_to_U16, CLCastToU16Fixture, CastS64toU16Dataset, one_tolerance) +CAST_SUITE(S64_to_S16, CLCastToS16Fixture, CastS64toS16Dataset, one_tolerance) +CAST_SUITE(S64_to_U32, CLCastToU32Fixture, CastS64toU32Dataset, one_tolerance) +CAST_SUITE(S64_to_S32, CLCastToS32Fixture, CastS64toS32Dataset, one_tolerance) +CAST_SUITE(S64_to_F16, CLCastToF16Fixture, CastS64toF16Dataset, zero_tolerance) +CAST_SUITE(S64_to_F32, CLCastToF32Fixture, CastS64toF32Dataset, zero_tolerance) // U64 -CAST_SUITE(U64_to_U8, DataType::U64, DataType::U8, CLCastToU8Fixture, CastU64toU8Dataset, one_tolerance) -CAST_SUITE(U64_to_S8, DataType::U64, DataType::S8, CLCastToS8Fixture, CastU64toS8Dataset, one_tolerance) -CAST_SUITE(U64_to_U16, DataType::U64, DataType::U16, CLCastToU16Fixture, CastU64toU16Dataset, one_tolerance) -CAST_SUITE(U64_to_S16, DataType::U64, DataType::S16, CLCastToS16Fixture, CastU64toS16Dataset, one_tolerance) -CAST_SUITE(U64_to_U32, DataType::U64, DataType::U32, CLCastToU32Fixture, 
CastU64toU32Dataset, one_tolerance) -CAST_SUITE(U64_to_S32, DataType::U64, DataType::S32, CLCastToS32Fixture, CastU64toS32Dataset, one_tolerance) -CAST_SUITE(U64_to_F16, DataType::U64, DataType::F16, CLCastToF16Fixture, CastU64toF16Dataset, zero_tolerance) -CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, CLCastToF32Fixture, CastU64toF32Dataset, zero_tolerance) +CAST_SUITE(U64_to_U8, CLCastToU8Fixture, CastU64toU8Dataset, one_tolerance) +CAST_SUITE(U64_to_S8, CLCastToS8Fixture, CastU64toS8Dataset, one_tolerance) +CAST_SUITE(U64_to_U16, CLCastToU16Fixture, CastU64toU16Dataset, one_tolerance) +CAST_SUITE(U64_to_S16, CLCastToS16Fixture, CastU64toS16Dataset, one_tolerance) +CAST_SUITE(U64_to_U32, CLCastToU32Fixture, CastU64toU32Dataset, one_tolerance) +CAST_SUITE(U64_to_S32, CLCastToS32Fixture, CastU64toS32Dataset, one_tolerance) +CAST_SUITE(U64_to_F16, CLCastToF16Fixture, CastU64toF16Dataset, zero_tolerance) +CAST_SUITE(U64_to_F32, CLCastToF32Fixture, CastU64toF32Dataset, zero_tolerance) TEST_SUITE_END() // Cast TEST_SUITE_END() // CL diff --git a/tests/validation/CL/LogSoftmaxLayer.cpp b/tests/validation/CL/LogSoftmaxLayer.cpp index b7f6a66e42..972d556ad2 100644 --- a/tests/validation/CL/LogSoftmaxLayer.cpp +++ b/tests/validation/CL/LogSoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,14 +45,24 @@ namespace /** Tolerance for float operations */ RelativeTolerance tolerance_f16(half(0.2)); RelativeTolerance tolerance_f32(0.001f); + +/** Tolerance for quantized operations */ +constexpr AbsoluteTolerance tolerance_qasymm8(1U); +constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); + } // namespace +using framework::dataset::make; + TEST_SUITE(CL) TEST_SUITE(LogSoftmaxLayer) template using CLLogSoftmaxLayerFixture = SoftmaxValidationFixture; +template +using CLLogSoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture; + TEST_SUITE(Float) TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), @@ -108,6 +118,73 @@ FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture, framework::Datase } TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayer4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -4, 3 }))) +{ + // Validate output + validate(CLAccessor(_target), 
_reference, tolerance_qasymm8_signed); +} +TEST_SUITE_END() // QASYMM8_SIGNED +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, 1 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayer4DShapes(), + make("DataType", DataType::QASYMM8), + make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0, -4, 3 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // Quantized TEST_SUITE_END() // LogSoftmaxLayer TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/CL/QuantizationLayer.cpp b/tests/validation/CL/QuantizationLayer.cpp index 335d8df293..25ac2f7d41 100644 --- a/tests/validation/CL/QuantizationLayer.cpp +++ b/tests/validation/CL/QuantizationLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,14 +45,64 @@ namespace constexpr AbsoluteTolerance tolerance_f32(1.0f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */ constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED data types */ +constexpr AbsoluteTolerance zero_tolerance_s8(0); constexpr AbsoluteTolerance tolerance_u16(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM16 data types */ const auto QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes()); const auto QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes()); + +void test_specific_case_int8(const std::vector &values, const std::vector &expected, + DataType dtype, const QuantizationInfo &in_qinfo, const QuantizationInfo &out_qinfo) +{ + // The test case here covers both Int8 and UInt8 because the underlying kernel is the same + const auto shape = TensorShape(values.size()); + + CLTensor input = create_tensor(shape, dtype, 1, in_qinfo); + CLTensor output = create_tensor(shape, dtype, 1, out_qinfo); + + CLQuantizationLayer quant_layer; + quant_layer.configure(&input, &output); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + SimpleTensor ref {shape, dtype, 1, out_qinfo}; + + library->fill_static_values(CLAccessor(input), values); + library->fill_static_values(ref, expected); + + quant_layer.run(); + + validate(CLAccessor(output), ref, 
zero_tolerance_s8); +} } // namespace TEST_SUITE(CL) TEST_SUITE(QuantizationLayer) +TEST_CASE(ProperlyRoundedRequantizationLt16Elements, framework::DatasetMode::ALL) +{ + std::vector values = {1,3,5,7,9}; + std::vector expected = {0,1,2,3,4}; // (x + 1)/2 - 1 + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto in_qinfo = QuantizationInfo(0.5f, -1); + const auto out_qinfo = QuantizationInfo(1.f, -1); + + test_specific_case_int8(values, expected, dtype, in_qinfo, out_qinfo); +} + +TEST_CASE(ProperlyRoundedRequantizationGt16Elements, framework::DatasetMode::ALL) +{ + std::vector values = {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35}; + std::vector expected = {0,1,2,3,4,5 ,6 ,7 ,8 ,9 ,10,11,12,13,14,15,16,17}; // (x + 1)/2 - 1 + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto in_qinfo = QuantizationInfo(0.5f, -1); + const auto out_qinfo = QuantizationInfo(1.f, -1); + + test_specific_case_int8(values, expected, dtype, in_qinfo, out_qinfo); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index 59cd4b0a88..56aafcad27 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -20,141 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +file(GLOB_RECURSE files_validation_unit "UNIT/*.cpp") +file(GLOB_RECURSE files_validation_cpp "CPP/*.cpp") + target_sources( arm_compute_validation - PRIVATE UNIT/SafeIntegerOps.cpp - UNIT/Version.cpp - UNIT/TensorInfo.cpp - UNIT/TensorShape.cpp - UNIT/Utils.cpp - UNIT/SubTensorInfo.cpp - UNIT/WindowIterator.cpp - UNIT/LifetimeManager.cpp - UNIT/GPUTarget.cpp - CPP/DetectionPostProcessLayer.cpp - CPP/TopKV.cpp - CPP/DFT.cpp - CPP/Permute.cpp - CPP/NonMaximumSuppression.cpp) + PRIVATE ${files_validation_unit} + ${files_validation_cpp} + ) + +file(GLOB_RECURSE files_validation_neon "NEON/*.cpp") +file(GLOB_RECURSE files_validation_runtime "runtime/*.cpp") if(ENABLE_NEON) target_sources( arm_compute_validation - PRIVATE NEON/ElementwiseNegation.cpp - NEON/BoundingBoxTransform.cpp - NEON/ChannelShuffle.cpp - NEON/Logical.cpp - NEON/DilatedConvolutionLayer.cpp - NEON/PoolingLayer.cpp - NEON/BitwiseNot.cpp - NEON/FillBorder.cpp - NEON/ElementwiseRsqrtLayer.cpp - NEON/DepthConcatenateLayer.cpp - NEON/ElementwisePower.cpp - NEON/Fill.cpp - NEON/ROIPoolingLayer.cpp - NEON/LSTMLayer.cpp - NEON/ArithmeticSubtraction.cpp - NEON/GEMMLowp.cpp - NEON/Unstack.cpp - NEON/Slice.cpp - NEON/Pooling3dLayer.cpp - NEON/BitwiseOr.cpp - NEON/HeightConcatenateLayer.cpp - NEON/ReshapeLayer.cpp - NEON/SoftmaxLayer.cpp - NEON/Gather.cpp - NEON/CropResize.cpp - NEON/ReductionOperation.cpp - NEON/PixelWiseMultiplication.cpp - NEON/LogSoftmaxLayer.cpp - NEON/DepthConvertLayer.cpp - NEON/Flatten.cpp - NEON/ElementwiseKernelSelection.cpp - NEON/DepthToSpaceLayer.cpp - NEON/ElementwiseAbsoluteValue.cpp - NEON/PadLayer.cpp - NEON/MeanStdDevNormalizationLayer.cpp - NEON/GlobalPoolingLayer.cpp - NEON/RNNLayer.cpp - NEON/DetectionPostProcessLayer.cpp - NEON/ElementwiseRound.cpp - NEON/BitwiseXor.cpp - NEON/GEMM.cpp - NEON/FuseBatchNormalization.cpp - NEON/BitwiseAnd.cpp - NEON/ElementwiseMax.cpp - NEON/ReduceMean.cpp - NEON/Reverse.cpp - NEON/L2NormalizeLayer.cpp - NEON/Convolution3D.cpp - NEON/ArithmeticAddition.cpp - NEON/ActivationLayer.cpp - NEON/SpaceToBatchLayer.cpp - NEON/ElementwiseLog.cpp - NEON/LSTMLayerQuantized.cpp - NEON/Im2Col.cpp - 
NEON/DequantizationLayer.cpp - NEON/DeconvolutionLayer.cpp - NEON/Select.cpp - NEON/ElementwiseSin.cpp - NEON/PReluLayer.cpp - NEON/BatchNormalizationLayer.cpp - NEON/ElementwiseMin.cpp - NEON/InstanceNormalizationLayer.cpp - NEON/ROIAlignLayer.cpp - NEON/ElementwiseDivision.cpp - NEON/ElementwiseExpLayer.cpp - NEON/ArgMinMax.cpp - NEON/QLSTMLayerNormalization.cpp - NEON/Col2Im.cpp - NEON/Split.cpp - NEON/Transpose.cpp - NEON/GenerateProposalsLayer.cpp - NEON/StackLayer.cpp - NEON/WidthConcatenateLayer.cpp - NEON/NormalizationLayer.cpp - NEON/Copy.cpp - NEON/ElementwiseSquareDiff.cpp - NEON/MaxUnpoolingLayer.cpp - NEON/Permute.cpp - NEON/Comparisons.cpp - NEON/BatchConcatenateLayer.cpp - NEON/Tile.cpp - NEON/BatchToSpaceLayer.cpp - NEON/SpaceToDepthLayer.cpp - NEON/DepthwiseConvolutionLayerNative.cpp - NEON/QuantizationLayer.cpp - NEON/ConvertFullyConnectedWeights.cpp - NEON/Floor.cpp - NEON/FFT.cpp - NEON/Cast.cpp - NEON/PriorBoxLayer.cpp - NEON/Scale.cpp - NEON/ReorgLayer.cpp - NEON/Range.cpp - NEON/DirectConvolutionLayer.cpp - NEON/DepthwiseConvolutionLayer.cpp - NEON/FullyConnectedLayer.cpp - NEON/ConvolutionLayer.cpp - NEON/StridedSlice.cpp - NEON/ReorderLayer.cpp - NEON/UNIT/DynamicTensor.cpp - NEON/UNIT/TensorAllocator.cpp - NEON/UNIT/MemoryManager.cpp - NEON/UNIT/RuntimeContext.cpp - runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp - runtime/experimental/operators/CpuActivation.cpp - runtime/experimental/operators/CpuAdd.cpp - runtime/experimental/operators/CpuDepthwiseConv2d.cpp - runtime/experimental/operators/CpuElementwise.cpp - runtime/experimental/operators/CpuGemm.cpp - runtime/experimental/operators/CpuGemmConv2d.cpp - runtime/experimental/operators/CpuGemmDirectConv2d.cpp - runtime/experimental/operators/CpuMul.cpp - runtime/experimental/operators/CpuSoftmax.cpp - runtime/experimental/operators/CpuSub.cpp - runtime/experimental/operators/CpuTranspose.cpp - runtime/experimental/operators/CpuWinogradConv2d.cpp - ) + PRIVATE ${files_validation_neon} + ${files_validation_runtime} + ) endif() diff --git a/tests/validation/CPP/LUT.cpp b/tests/validation/CPP/LUT.cpp index 1874823d8d..ab005e3ed6 100644 --- a/tests/validation/CPP/LUT.cpp +++ b/tests/validation/CPP/LUT.cpp @@ -26,6 +26,7 @@ #include "tests/validation/Validation.h" #include "src/core/helpers/LUTManager.h" #include "include/half/half.hpp" +#include "tests/validation/Helpers.h" namespace arm_compute { @@ -85,7 +86,7 @@ TEST_SUITE(BF16) TEST_CASE(LUTValueTest, framework::DatasetMode::ALL) { // Define values for test - constexpr float beta = 1.0f; + constexpr float beta = -1.0f; constexpr float rel_tolerance = 0.01f; constexpr int num_elements = 65536; unsigned int num_mismatches = 0; @@ -97,14 +98,14 @@ TEST_CASE(LUTValueTest, framework::DatasetMode::ALL) if(CPUInfo::get().has_fp16()) { // Retrieve lut, Assert lut exists and is retrieved successfully. - std::shared_ptr lut = lman.get_lut_table(info); + std::shared_ptr lut = lman.get_lut_table(info); ARM_COMPUTE_EXPECT(lut != nullptr, framework::LogLevel::ALL); // Check each value in lut for(int i=0; i < num_elements; i++) { // Calculate reference in fp32. Convert lut value to fp32. 
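            // (A minimal sketch, for orientation only, of what a bf16_to_float-style conversion of the
            // 16-bit table index involves: bfloat16 keeps the upper 16 bits of an IEEE-754 binary32 value,
            // so the conversion is a shift plus a bit copy. The helper name below is hypothetical and the
            // library's own bf16_to_float() may be implemented differently.)
            //
            //     float bf16_bits_to_float(uint16_t bits)              // hypothetical helper, needs <cstring>
            //     {
            //         const uint32_t word = uint32_t(bits) << 16;      // bf16 occupies the upper half of a float32
            //         float          f    = 0.0f;
            //         std::memcpy(&f, &word, sizeof(f));
            //         return f;
            //     }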
- const float fref = std::exp(bf16_to_float(i) * beta * -1); + const float fref = std::exp(bf16_to_float(i) * beta); const uint16_t target_bf16 = read_as_bf16((*lut)[i]); const float target = bf16_to_float(target_bf16); @@ -133,11 +134,19 @@ TEST_CASE(LUTValueTest, framework::DatasetMode::ALL) TEST_CASE(CheckLutReuse, framework::DatasetMode::ALL) { - LUTInfo info = {LUTType::Exponential, 1.0f, DataType::BFLOAT16, UniformQuantizationInfo()}; - LUTManager lman = LUTManager::get_instance(); - auto first = lman.get_lut_table(info); - auto second = lman.get_lut_table(info); - ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS); + if (cpu_supports_dtypes({DataType::BFLOAT16})) + { + LUTInfo info = {LUTType::Exponential, -1.0f, DataType::BFLOAT16, UniformQuantizationInfo()}; + LUTManager lman = LUTManager::get_instance(); + auto first = lman.get_lut_table(info); + auto second = lman.get_lut_table(info); + ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support BFLOAT16 vector operations. Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } } diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp index 668c60545b..7a4f767175 100644 --- a/tests/validation/NEON/Cast.cpp +++ b/tests/validation/NEON/Cast.cpp @@ -35,14 +35,22 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/CastFixture.h" + +#include +#include + namespace arm_compute { namespace test { namespace validation { + +using framework::dataset::make; + namespace { // Tolerance @@ -56,60 +64,107 @@ constexpr AbsoluteTolerance zero_tolerance(0); /** Input data sets **/ // QASYMM8_SIGNED -const auto CastQASYMM8_SIGNEDtoS16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::S16)); -const auto CastQASYMM8_SIGNEDtoS32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::S32)); -const auto CastQASYMM8_SIGNEDtoF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::F32)); -const auto CastQASYMM8_SIGNEDtoF16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::F16)); +const auto CastQASYMM8_SIGNEDtoS16Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::S16)); +const auto CastQASYMM8_SIGNEDtoS32Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::S32)); +const auto CastQASYMM8_SIGNEDtoF32Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::F32)); +const auto CastQASYMM8_SIGNEDtoF16Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::F16)); // QASYMM8 -const auto CastQASYMM8toF16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F16)); -const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32)); -const auto CastQASYMM8toS32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::S32)); 
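// The make()/combine() helpers used in the dataset definitions above and below build the test grids:
// make("DataType", ...) names a value set and combine(...) forms the cartesian product, so every
// (input, output) type pair is fed to the cast fixture as its own test combination. A rough,
// illustrative sketch of that expansion only (plain C++, not the framework's dataset types;
// needs <utility> and <vector>):
std::vector<std::pair<DataType, DataType>> cast_type_pairs_sketch(const std::vector<DataType> &inputs,
                                                                  const std::vector<DataType> &outputs)
{
    std::vector<std::pair<DataType, DataType>> pairs;
    for(DataType in : inputs)
    {
        for(DataType out : outputs)
        {
            pairs.emplace_back(in, out); // one test combination per (input, output) pair
        }
    }
    return pairs;
}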
+const auto CastQASYMM8toF16Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F16)); +const auto CastQASYMM8toF32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F32)); +const auto CastQASYMM8toS32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::S32)); // U8 -const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16)); -const auto CastU8toS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16)); -const auto CastU8toS32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32)); -const auto CastU8toF32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F32)); +const auto CastU8toU16Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::U16)); +const auto CastU8toS16Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::S16)); +const auto CastU8toS32Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::S32)); +const auto CastU8toF32Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::F32)); // U16 -const auto CastU16toU8Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8)); -const auto CastU16toU32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32)); +const auto CastU16toU8Dataset = combine(make("DataType", DataType::U16), make("DataType", DataType::U8)); +const auto CastU16toU32Dataset = combine(make("DataType", DataType::U16), make("DataType", DataType::U32)); // S16 -const auto CastS16toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); -const auto CastS16toU8Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8)); -const auto CastS16toS32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32)); +const auto CastS16toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::S16), make("DataType", DataType::QASYMM8_SIGNED)); +const auto CastS16toU8Dataset = combine(make("DataType", DataType::S16), make("DataType", DataType::U8)); +const auto CastS16toS32Dataset = combine(make("DataType", DataType::S16), make("DataType", DataType::S32)); //S32 -const auto CastS32toF16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16)); -const auto CastS32toU8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8)); -const auto CastS32toF32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32)); -const auto CastS32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8)); -const auto CastS32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); +const auto CastS32toF16Dataset = combine(make("DataType", 
DataType::S32), make("DataType", DataType::F16)); +const auto CastS32toU8Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::U8)); +const auto CastS32toF32Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::F32)); +const auto CastS32toQASYMM8Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::QASYMM8)); +const auto CastS32toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::S32), make("DataType", DataType::QASYMM8_SIGNED)); // F16 -const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32)); -const auto CastF16toS32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32)); -const auto CastF16toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8)); -const auto CastF16toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); +const auto CastF16toF32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::F32)); +const auto CastF16toS32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::S32)); +const auto CastF16toQASYMM8Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::QASYMM8)); +const auto CastF16toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::F16), make("DataType", DataType::QASYMM8_SIGNED)); // F32 -const auto CastF32toU8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8)); -const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16)); -const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32)); -const auto CastF32toQASYMM8Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8)); -const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)); +const auto CastF32toU8Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::U8)); +const auto CastF32toF16Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::F16)); +const auto CastF32toS32Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::S32)); +const auto CastF32toQASYMM8Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::QASYMM8)); +const auto CastF32toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::F32), make("DataType", DataType::QASYMM8_SIGNED)); // U64 -const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32)); +const auto CastU64toF32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F32)); // S64 -const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32)); +const auto CastS64toF32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F32)); + +template +void validate_static_cast(const TensorShape 
&shape, DataType src_dtype, DataType dst_dtype)
+{
+    Tensor input  = create_tensor<Tensor>(shape, src_dtype, 1);
+    Tensor output = create_tensor<Tensor>(shape, dst_dtype, 1);
+
+    NECast cast;
+    cast.configure(&input, &output, ConvertPolicy::SATURATE);
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+
+    library->fill_tensor_value(Accessor(input), 1.99f);
+    cast.run();
+
+    for(unsigned int i = 0; i < shape.x(); ++i)
+    {
+        const T ref    = 1;
+        const T target = reinterpret_cast<const T *>(output.buffer())[i];
+
+        ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS);
+    }
+}
+
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(Cast)
+
+// Validate casting truncates floats to integer instead of rounding
+DATA_TEST_CASE(ValidateStaticCastBehavior, framework::DatasetMode::ALL,
+    combine(
+        make("InputDataType", {DataType::F32, DataType::F16}),
+        make("OutputDataType", {DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8})),
+    src_dtype, dst_dtype)
+{
+    const auto shape = TensorShape(18U); // > 16 for channel dim. to stress vector and leftover loops
+
+    if(src_dtype == DataType::F32 || (src_dtype == DataType::F16 && cpu_supports_dtypes({DataType::F16})))
+    {
+        if(dst_dtype == DataType::QASYMM8_SIGNED)
+        {
+            validate_static_cast<int8_t>(shape, src_dtype, dst_dtype);
+        }
+        else
+        {
+            validate_static_cast<uint8_t>(shape, src_dtype, dst_dtype);
+        }
+    }
+}
+
 template <typename T>
 using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
 template <typename T>
@@ -206,7 +261,7 @@ CAST_SUITE(F32_to_QASYMM8, DataType::F32, DataType::QASYMM8, NECastToQASYMM8Fixt
 CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture, CastF32toF16Dataset, zero_tolerance)
 #endif // ARM_COMPUTE_ENABLE_FP16
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture, CastF32toS32Dataset, one_tolerance)
-CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture, CastF32toS32Dataset, one_tolerance)
+CAST_SUITE(F32_to_U8, DataType::F32, DataType::U8, NECastToU8Fixture, CastF32toU8Dataset, one_tolerance)
 #ifdef __aarch64__
 // S64
@@ -217,8 +272,8 @@ CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture; using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture;
-using framework::dataset::make;
-
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), shape_a, shape_b, shape_c, a_offset, b_offset) {
@@ -368,7 +373,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua
combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(), make("DataType", { DataType::QASYMM8 }), make("reshape_b_only_on_first_run", { false }), - make("updated_sq_info_after_config", { true }))) + make("updated_sq_info_after_config", { true }), + QuantizedActivationFunctionsDataset + )) { validate(Accessor(_target), _reference, tolerance_batched); } @@ -397,7 +408,9 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(), make("DataType", { DataType::QASYMM8 }), make("reshape_b_only_on_first_run", { false }), - make("updated_sq_info_after_config", { true }))) + make("updated_sq_info_after_config", { true }), + QuantizedActivationFunctionsDataset + )) { validate(Accessor(_target), _reference, tolerance_batched); } diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp index ef79faba51..b75b94e32f 100644 --- a/tests/validation/NEON/MatMul.cpp +++ b/tests/validation/NEON/MatMul.cpp @@ -55,6 +55,7 @@ constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); // clang-format off // *INDENT-OFF* // Validation Tests +#ifdef __aarch64__ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( make("InputAInfo", { @@ -108,6 +109,61 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, CpuMatMulSettings()); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } +#else // __aarch64__ +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, + zip( + make("InputAInfo", { + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Mismatching datatype + TensorInfo(TensorShape(9U, 6U), 1, DataType::S32), // Unsupported datatypes + TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32), // Broadcasting in batch dimension not supported + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), // Invalid shape for multiplication + TensorInfo(TensorShape(9U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), + TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type + }), + make("InputBInfo", { + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(5U, 9U), 1, DataType::S32), + TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), // MatMul of Qauntized Datatypes Not supported on armv7a + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED), + }), + make("OutputInfo", { + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::S32), + TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED), + TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8), + }), + 
make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}), + make("Expected", { false, false, false, false, true, true, false, false, false, false })), + a_info, b_info, output_info, are_tensors_const, expected) +{ + TensorInfo a{a_info}; + TensorInfo b{b_info}; + a.set_are_values_constant(are_tensors_const); + b.set_are_values_constant(are_tensors_const); + Status status = NEMatMul::validate(&a, + &b, + &output_info, + MatMulInfo(), + CpuMatMulSettings()); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +#endif // __aarch64__ // *INDENT-ON* // clang-format on diff --git a/tests/validation/NEON/Permute.cpp b/tests/validation/NEON/Permute.cpp index e9939105cd..5c51c7c032 100644 --- a/tests/validation/NEON/Permute.cpp +++ b/tests/validation/NEON/Permute.cpp @@ -31,6 +31,7 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/PermuteFixture.h" @@ -179,14 +180,30 @@ TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, NEPermuteFixture, framework::DatasetMode::PRECOMMIT, PermuteParametersSmall * framework::dataset::make("DataType", DataType::F16)) { - // Validate output - validate(Accessor(_target), _reference); + if (cpu_supports_dtypes({DataType::F16})) + { + // Validate output + validate(Accessor(_target), _reference); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } } FIXTURE_DATA_TEST_CASE(RunLarge, NEPermuteFixture, framework::DatasetMode::NIGHTLY, PermuteParametersLarge * framework::dataset::make("DataType", DataType::F16)) { - // Validate output - validate(Accessor(_target), _reference); + if (cpu_supports_dtypes({DataType::F16})) + { + // Validate output + validate(Accessor(_target), _reference); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. 
Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } } TEST_SUITE_END() #endif /* ARM_COMPUTE_ENABLE_FP16 */ diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp index fac5d73abd..da057c4c1f 100644 --- a/tests/validation/NEON/QuantizationLayer.cpp +++ b/tests/validation/NEON/QuantizationLayer.cpp @@ -34,6 +34,7 @@ #include "tests/validation/Validation.h" #include "tests/validation/fixtures/QuantizationLayerFixture.h" +#include namespace arm_compute { @@ -44,8 +45,11 @@ namespace validation namespace { /** Tolerance for quantization */ +/// @note: We do not expect any difference between our reference and target implementations for UInt8 and Int8 constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */ constexpr AbsoluteTolerance tolerance_s8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED data types */ +constexpr AbsoluteTolerance zero_tolerance_s8(0); + constexpr AbsoluteTolerance tolerance_u16(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM16 data types */ const auto QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes()); const auto QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes()); @@ -54,6 +58,38 @@ const auto QuantizationLargeShapes = concat(datasets: TEST_SUITE(NEON) TEST_SUITE(QuantizationLayer) +TEST_CASE(ProperlyRoundedRequantization, framework::DatasetMode::ALL) +{ + // The test case here covers both Int8 and UInt8 because the underlying kernel is the same + const auto shape = TensorShape(18U); // > 16 for channel dim. 
to stress vector and leftover loops + const auto dtype = DataType::QASYMM8_SIGNED; + const auto in_qinfo = QuantizationInfo(0.5f, -1); + const auto out_qinfo = QuantizationInfo(1.f, -1); + + Tensor input = create_tensor(shape, dtype, 1, in_qinfo); + Tensor output = create_tensor(shape, dtype, 1, out_qinfo); + + NEQuantizationLayer quant_layer; + quant_layer.configure(&input, &output); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35}; + std::vector expected = {0,1,2,3,4,5 ,6 ,7 ,8 ,9 ,10,11,12,13,14,15,16,17}; // (x + 1)/2 - 1 + + SimpleTensor ref {shape, dtype, 1, out_qinfo}; + + ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); + + library->fill_static_values(Accessor(input), values); + library->fill_static_values(ref, expected); + + quant_layer.run(); + + validate(Accessor(output), ref, zero_tolerance_s8); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp index e5692693bd..05d09369c2 100644 --- a/tests/validation/NEON/ReduceMean.cpp +++ b/tests/validation/NEON/ReduceMean.cpp @@ -34,6 +34,10 @@ #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ReduceMeanFixture.h" +#include +#include +#include + namespace arm_compute { namespace test @@ -54,6 +58,9 @@ constexpr AbsoluteTolerance tolerance_u8(2); /**< Tolerance value fo constexpr AbsoluteTolerance tolerance_s8(2); /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */ #endif // __aarch64__ +constexpr AbsoluteTolerance zero_tolerance_u8(0); +constexpr AbsoluteTolerance zero_tolerance_s8(0); + const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }), framework::dataset::make("KeepDims", { true })); const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false })); @@ -61,6 +68,87 @@ const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0) TEST_SUITE(NEON) TEST_SUITE(ReduceMean) +TEST_CASE(ProperRoundingPolicyXReduction, framework::DatasetMode::ALL) +{ + // We do not need to stress vector and leftover loops diffrently + // because the rounding is done scalarly at the end. Accumulation + // is done over integer types. 
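    // A worked example of the arithmetic this test pins down (a sketch of the intended
    // behaviour, assuming round-to-nearest requantization of the final mean):
    //   inputs (QASYMM8_SIGNED, scale 2/255, offset 0): {50, 26}
    //   integer mean                  : (50 + 26) / 2          = 38
    //   real value                    : 38 * (2/255)           = 76/255  (about 0.298)
    //   requantize (scale 6/255, -1)  : (76/255) / (6/255) - 1 = 76/6 - 1 (about 11.67)
    //   round to nearest              : 12   (plain truncation would give 11)
    // The ProperlyRoundedRequantization case above checks the same property via (x + 1)/2 - 1.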
+ constexpr int x_len = 2; + + const auto input_shape = TensorShape(x_len); + const auto output_shape = TensorShape(1); + const bool keep_dims = true; + const auto axis = Coordinates(0); + const auto input_qinfo = QuantizationInfo(2 / 255.f, 0); + const auto output_qinfo = QuantizationInfo(6 / 255.f, -1); + const auto dtype = DataType::QASYMM8_SIGNED; + + Tensor input = create_tensor(input_shape, dtype, 1, input_qinfo); + Tensor output = create_tensor(output_shape, dtype, 1, output_qinfo); + + NEReduceMean reduce_mean; + reduce_mean.configure(&input, axis, keep_dims, &output); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values {50, 26}; + library->fill_static_values(Accessor(input), values); + + std::vector expected {12}; + SimpleTensor ref{ output_shape, dtype, 1, input_qinfo }; + library->fill_static_values(ref, expected); + + reduce_mean.run(); + + // The tolerance should be 0 because this test stresses the rounding behavior of the operator + validate(Accessor(output), ref, zero_tolerance_s8); +} + +#ifdef __aarch64__ +// Due to the lack of instructions in a32, the rounding operation is less +// accurate +TEST_CASE(ProperRoundingPolicyNonXReduction, framework::DatasetMode::ALL) +{ + constexpr int x_len = 17; // > 16 to stress both vector and leftover loops + + const auto input_shape = TensorShape(x_len, 2, 2, 1); + const auto output_shape = TensorShape(x_len, 1, 1, 1); + const bool keep_dims = true; + const auto axis = Coordinates(1, 2); + const auto input_qinfo = QuantizationInfo(2 / 255.f, 127); + const auto output_qinfo = QuantizationInfo(2 / 255.f, 127); + const auto dtype = DataType::QASYMM8; + + Tensor input = create_tensor(input_shape, dtype, 1, input_qinfo); + Tensor output = create_tensor(output_shape, dtype, 1, output_qinfo); + + NEReduceMean reduce_mean; + reduce_mean.configure(&input, axis, keep_dims, &output); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + // {139, 139 ... 139 (x_len times) 154, 154, ... 
154 (x_len_times) ...} + std::vector values; + fill_n(back_inserter(values), x_len, 139); + fill_n(back_inserter(values), x_len, 154); + fill_n(back_inserter(values), x_len, 164); + fill_n(back_inserter(values), x_len, 179); + library->fill_static_values(Accessor(input), values); + + std::vector expected; + fill_n(back_inserter(expected), x_len, 159); // 159 = (139 + 154 + 164 + 179) / 4 + SimpleTensor ref{ output_shape, dtype, 1, input_qinfo }; + library->fill_static_values(ref, expected); + + reduce_mean.run(); + + // The tolerance should be 0 because this test stresses the rounding behavior of the operator + validate(Accessor(output), ref, zero_tolerance_u8); +} +#endif // __aarch64__ + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp index 55de2d6281..82e6ceaa71 100644 --- a/tests/validation/NEON/Scale.cpp +++ b/tests/validation/NEON/Scale.cpp @@ -28,6 +28,7 @@ #include "tests/framework/Macros.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ScaleFixture.h" +#include "utils/TypePrinter.h" namespace arm_compute { @@ -165,9 +166,23 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL) { const auto input = TensorInfo{ input_shape, 1, kv.first, default_data_layout }; const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout }; - - result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); - ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS); + if(cpu_supports_dtypes({kv.first})) + { + result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false }); + ARM_COMPUTE_EXPECT_EQUAL(bool(result) , kv.second, framework::LogLevel::ERRORS); + if(bool(result) != kv.second) + { + std::string fail_reason = "For " + to_string(kv.first) + " validate() returns " + to_string(bool(result)) + " but expected answer is " + to_string(kv.second); + ARM_COMPUTE_TEST_INFO(fail_reason); + framework::ARM_COMPUTE_PRINT_INFO(); + } + } + else + { + std::string skip_reason = "Skip supported datatype test because device does not support " + to_string(kv.first) + " vector operations."; + ARM_COMPUTE_TEST_INFO(skip_reason.c_str()); + framework::ARM_COMPUTE_PRINT_INFO(); + } } } diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index e428d7958b..c8c3f0bb49 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -145,7 +145,7 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, cpu_isa.fp16 = (data_type == DataType::F16); const auto *selected_impl = CpuSoftmaxKernel::get_implementation( - SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()}, + SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length_in_bits()}, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); @@ -232,6 +232,29 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture, framework::Dataset } TEST_SUITE_END() //FP16 #endif /* ARM_COMPUTE_ENABLE_FP16 */ +#ifdef ARM_COMPUTE_ENABLE_BF16 +constexpr AbsoluteTolerance tolerance_bf16{0.02f}; +TEST_SUITE(BF16) 
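// Context for the loose tolerance above: bfloat16 keeps only 8 significand bits, so softmax
// outputs in [0, 1] survive a bf16 round trip only to roughly 1 part in 256, and the kernel's
// accumulation adds further error, which makes a coarse absolute tolerance a reasonable choice.
// A minimal sketch of the conversion for reference (round-to-nearest-even on the upper 16 bits,
// NaN handling omitted; the helper names are illustrative and not part of the ACL API):
//
//   #include <cstdint>
//   #include <cstring>
//
//   inline uint16_t float_to_bf16(float f)
//   {
//       uint32_t bits;
//       std::memcpy(&bits, &f, sizeof(bits));
//       bits += 0x7FFFu + ((bits >> 16) & 1u); // round to nearest, ties to even
//       return static_cast<uint16_t>(bits >> 16);
//   }
//
//   inline float bf16_to_float(uint16_t h)
//   {
//       const uint32_t bits = static_cast<uint32_t>(h) << 16;
//       float f;
//       std::memcpy(&f, &bits, sizeof(f));
//       return f;
//   }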
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallShapes(), + make("DataType", DataType::BFLOAT16), + make("Beta", { 1.0f, 2.0f }), + make("Axis", { 0 }))) +{ + if(CPUInfo::get().has_bf16()) + { + // Validate output + validate(Accessor(_target), _reference, tolerance_bf16); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support bf16 vector operations. Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} +TEST_SUITE_END() //BF16 +#endif /* ARM_COMPUTE_ENABLE_BF16 */ TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp index 0aab9ef9b5..f2863552e2 100644 --- a/tests/validation/NEON/UNIT/TensorAllocator.cpp +++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -65,8 +65,14 @@ TEST_CASE(ImportMemory, framework::DatasetMode::ALL) ARM_COMPUTE_ASSERT(t1.info()->is_resizable()); // Negative case : Import misaligned pointer - Tensor t2; - const size_t required_alignment = 339; + Tensor t2; + size_t required_alignment = 339; + ARM_COMPUTE_ASSERT(data.get() != nullptr); + // If the data ptr is aligned with 339, keep adding 1 until it is misaligned. + while (arm_compute::utility::check_aligned(data.get(), required_alignment)) + { + required_alignment += 1; + } t2.allocator()->init(info, required_alignment); ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(data.get()))); ARM_COMPUTE_ASSERT(t2.info()->is_resizable()); diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h index 289aca4d08..4c07134c35 100644 --- a/tests/validation/Validation.h +++ b/tests/validation/Validation.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_VALIDATION_H -#define ARM_COMPUTE_TEST_VALIDATION_H +#ifndef ACL_TESTS_VALIDATION_VALIDATION_H +#define ACL_TESTS_VALIDATION_VALIDATION_H #include "arm_compute/core/IArray.h" #include "arm_compute/core/Types.h" @@ -54,6 +54,14 @@ inline bool are_equal_infs(T val0, T val1) const auto same_sign = support::cpp11::signbit(val0) == support::cpp11::signbit(val1); return (!support::cpp11::isfinite(val0)) && (!support::cpp11::isfinite(val1)) && same_sign; } + +#ifdef ARM_COMPUTE_ENABLE_FP16 +template <> +inline bool are_equal_infs(float16_t val0, float16_t val1) +{ + return are_equal_infs(static_cast(val0), static_cast(val1)); +} +#endif /* ARM_COMPUTE_ENABLE_FP16 */ } // namespace /** Class reprensenting an absolute tolerance value. 
*/ @@ -689,4 +697,4 @@ void validate_min_max_loc(const MinMaxLocationValues &target, const MinMaxLoc } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_REFERENCE_VALIDATION_H */ +#endif // ACL_TESTS_VALIDATION_VALIDATION_H diff --git a/tests/validation/fixtures/CastFixture.h b/tests/validation/fixtures/CastFixture.h index 8297ec81dc..432df69b41 100644 --- a/tests/validation/fixtures/CastFixture.h +++ b/tests/validation/fixtures/CastFixture.h @@ -65,6 +65,10 @@ class CastValidationFixture : public framework::Fixture case DataType::U8: case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + case DataType::QSYMM16: + case DataType::QASYMM16: case DataType::S8: case DataType::F32: { @@ -113,9 +117,13 @@ class CastValidationFixture : public framework::Fixture TensorType compute_target(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy) { + // These are necessary but not used qinfo for creating tensor buffer for QSYMM8_PER_CHANNEL + QuantizationInfo src_not_used_qinfo(0.25f, 2); + QuantizationInfo dst_not_used_qinfo(0.5f, 2); + // Create tensors - TensorType src = create_tensor(shape, dt_in, 1); - TensorType dst = create_tensor(shape, dt_out, 1); + TensorType src = create_tensor(shape, dt_in, 1, src_not_used_qinfo); + TensorType dst = create_tensor(shape, dt_out, 1, dst_not_used_qinfo); // Create and configure function FunctionType cast; diff --git a/tests/validation/fixtures/CpuDequantizeFixture.h b/tests/validation/fixtures/CpuDequantizeFixture.h new file mode 100644 index 0000000000..06352818fc --- /dev/null +++ b/tests/validation/fixtures/CpuDequantizeFixture.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H + + +#include "tests/validation/fixtures/DequantizationLayerFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class CpuDequantizationValidationFixture : public DequantizationValidationFixture +{ +public: + void setup(TensorShape shape, DataType src_data_type, DataType dst_datatype, DataLayout data_layout) + { + if(!cpu_supports_dtypes({src_data_type, dst_datatype})){ + return; + } + + this->_quantization_info = this->generate_quantization_info(src_data_type, shape.z()); + this->_target = this->compute_target(shape, src_data_type, dst_datatype, data_layout); + this->_reference = this->compute_reference(shape, src_data_type); + } + +protected: + TensorType compute_target(TensorShape shape, DataType src_data_type, DataType dst_datatype, DataLayout data_layout) + { + if(data_layout == DataLayout::NHWC) + { + permute(shape, PermutationVector(2U, 0U, 1U)); + } + + // Create tensors + TensorType src = create_tensor(shape, src_data_type, 1, this->_quantization_info, data_layout); + TensorType dst = create_tensor(shape, dst_datatype, 1, QuantizationInfo(), data_layout); + + // Create and configure function + FunctionType dequantization_layer; + dequantization_layer.configure(src.info(), dst.info()); + + ARM_COMPUTE_ASSERT(src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + this->fill(AccessorType(src)); + + // Prepare tensor pack + ITensorPack run_pack = { { arm_compute::TensorType::ACL_SRC, &src }, + { arm_compute::TensorType::ACL_DST, &dst } }; + + // Compute function + dequantization_layer.run(run_pack); + + return dst; + } + +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H diff --git a/tests/validation/fixtures/CpuGEMMLowpFixture.h b/tests/validation/fixtures/CpuGEMMLowpFixture.h new file mode 100644 index 0000000000..91083ea0cf --- /dev/null +++ b/tests/validation/fixtures/CpuGEMMLowpFixture.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H + +#include "tests/validation/fixtures/GEMMLowpFixture.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ + +namespace { +template +TensorType compute_cpugemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, + const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), + bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN) +{ + ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); + // If unknown, set to sensible defaults + if (data_type_output == DataType::UNKNOWN) { + data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a; + } + + // Create tensors + TensorType a = create_tensor(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : a_qinfo); + TensorType b = create_tensor(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated + TensorType output = create_tensor(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */); + + TensorType bias; + if(is_fused) + { + TensorShape bias_shape(shape_b[0]); + bias = create_tensor(bias_shape,data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1); + } + + // Create and configure function + // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output + FunctionType gemmlowp; + gemmlowp.configure(a.info(), b.info(), is_fused ? bias.info() : nullptr, output.info(), GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? 
shape_output[2] : 0), reinterpret_input_as_3d, false, + output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/, + arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, + false /* pretranspose_B */, accumulate)); + + // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic) + if (dynamic_qinfo) + { + a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true)); + b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true)); + } + + ARM_COMPUTE_ASSERT(a.info()->is_resizable()); + ARM_COMPUTE_ASSERT(b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(output.info()->is_resizable()); + + add_padding_x({ &a, &b, &output }); + + // Allocate tensors + a.allocator()->allocate(); + b.allocator()->allocate(); + output.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!a.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!output.info()->is_resizable()); + + // telhs are newly created every call of this lambda function + ITensorPack pack = + { + { arm_compute::TensorType::ACL_SRC_0, &a }, + { arm_compute::TensorType::ACL_SRC_1, &b }, + { arm_compute::TensorType::ACL_DST, &output } + }; + + // Fill tensors + fill_quantized(AccessorType(a), 0 + finfo.hash); + fill_quantized(AccessorType(b), 1 + finfo.hash); + + if (accumulate) + { + ARM_COMPUTE_ASSERT(accumulate != run_twice); + fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output); + } + + if(is_fused) + { + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + bias.allocator()->allocate(); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); + pack.add_tensor(arm_compute::TensorType::ACL_SRC_2, &bias); + } + + auto mg = MemoryGroup{}; + auto ws = manage_workspace(gemmlowp.workspace(), mg, pack, pack); + allocate_tensors(gemmlowp.workspace(), ws); + + // Run with variable inputs. 
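    // (Descriptive note: when run_twice is set, the function is executed once, the inputs are
    //  refilled from a different seed, and it is executed again, so any state carried over from
    //  the first run, for example a reshaped B when reshape_b_only_on_first_run is true, is
    //  exercised with fresh data.)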
+ if(run_twice) + { + gemmlowp.run(pack); + fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run + fill_quantized(AccessorType(b), 4 + finfo.hash); + if(is_fused) + { + fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias); + } + } + + // Compute GEMM function + gemmlowp.run(pack); + return output; +} +} // namespace + +template +class CpuGEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset); + const auto b_qinfo = QuantizationInfo(2.0f / 255, b_offset); + TensorFillInfo finfo; + + bool accumulate = false; + bool dynamic_qinfo = false; + this->_target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo); + this->_reference = this->compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate); + } + +protected: + TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo) + { + const auto output_qinfo = QuantizationInfo(); // No output stage + return compute_cpugemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo); + } +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H diff --git a/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h b/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h index fc070eb7a0..5d74e210d5 100644 --- a/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h +++ b/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h @@ -24,11 +24,15 @@ #ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMASSEMBLYDISPATCHFIXTURE_H #define ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMASSEMBLYDISPATCHFIXTURE_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" + #include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" +#include "tests/validation/reference/ActivationLayer.h" #include "tests/validation/reference/GEMM.h" -#include "arm_compute/core/Helpers.h" namespace arm_compute { @@ -40,19 +44,27 @@ template ::value && // Cpu + data_type == DataType::F16 && !CPUInfo::get().has_fp16()) + { + return; + } ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(beta); - _target = compute_target(shape_a, shape_b, shape_c, output_shape, data_type, accumulate); - _reference = compute_reference(shape_a, shape_b, output_shape, data_type, accumulate); + _target = + compute_target(shape_a, shape_b, shape_c, output_shape, data_type, accumulate, pretranspose_b, act_info); + _reference = compute_reference(shape_a, shape_b, output_shape, data_type, accumulate, act_info); } protected: @@ -78,48 +90,56 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur } } - TensorType compute_target(const TensorShape &shape_a, - const TensorShape &shape_b, - const TensorShape &shape_c, - const TensorShape &output_shape, - 
DataType data_type, - bool accumulate) + TensorType compute_target(const TensorShape &shape_a, + const TensorShape &shape_b, + const TensorShape &shape_c, + const TensorShape &output_shape, + DataType data_type, + bool accumulate, + bool pretranspose_b, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(shape_c); // Create tensors - TensorType a = create_tensor(shape_a, data_type, 1); - TensorType b = create_tensor(shape_b, data_type, 1); - TensorType *c = nullptr; - TensorType dst = create_tensor(output_shape, data_type, 1); + TensorType a = create_tensor(shape_a, data_type, 1); + TensorType b = create_tensor(shape_b, data_type, 1); + TensorType b_transposed = create_tensor({shape_b[1], shape_b[0]}, data_type, 1); + TensorType *c = nullptr; + TensorType dst = create_tensor(output_shape, data_type, 1); // Create and configure function FunctionType gemm; + NETranspose transpose; - add_padding_x({&a, &b, &dst}); + add_padding_x({&a, &b, &b_transposed, &dst}); GEMMInfo gemm_info; gemm_info.set_accumulate(accumulate); + gemm_info.set_pretranspose_B(pretranspose_b); + gemm_info.set_activation_info(act_info); + + TensorType &b_to_use = pretranspose_b ? b_transposed : b; - ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b.info(), nullptr, dst.info(), gemm_info)); + ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b_to_use.info(), nullptr, dst.info(), gemm_info)); - // The GEMMinfo includes the values of the depth in case of reinterpreted 3d output. - // If the output shape has the same number of dimensions of the input the method called is a 2D matrix multiplication (depth_output_reinterpreted_as_3D = 0), - // in the other case we have to use the reinterpreted version of GEMM (depth_output_reinterpreted_as_3D = depth of the 3D output). - gemm.configure(a.info(), b.info(), nullptr, dst.info(), gemm_info); + gemm.configure(a.info(), b_to_use.info(), nullptr, dst.info(), gemm_info); ARM_COMPUTE_ASSERT(gemm.is_configured()); ARM_COMPUTE_ASSERT(a.info()->is_resizable()); ARM_COMPUTE_ASSERT(b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(b_transposed.info()->is_resizable()); ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); // Allocate tensors a.allocator()->allocate(); b.allocator()->allocate(); + b_transposed.allocator()->allocate(); dst.allocator()->allocate(); ARM_COMPUTE_ASSERT(!a.info()->is_resizable()); ARM_COMPUTE_ASSERT(!b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!b_transposed.info()->is_resizable()); ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); // Fill tensors @@ -130,13 +150,19 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur fill(AccessorType(dst), 6); }; + if (pretranspose_b) + { + transpose.configure(&b, &b_transposed); + transpose.run(); + } + ITensorPack run_pack{{arm_compute::TensorType::ACL_SRC_0, &a}, - {arm_compute::TensorType::ACL_SRC_1, &b}, + {arm_compute::TensorType::ACL_SRC_1, &b_to_use}, {arm_compute::TensorType::ACL_SRC_2, c}, {arm_compute::TensorType::ACL_DST_0, &dst}}; // Prepare memory - ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b}, {arm_compute::TensorType::ACL_SRC_2, c}}; + ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b_to_use}, {arm_compute::TensorType::ACL_SRC_2, c}}; experimental::MemoryRequirements aux_mem_req = gemm.workspace(); MemoryGroup memory_group{}; @@ -157,7 +183,7 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur } else { - run_pack.add_const_tensor(ACL_SRC_1, &b); + run_pack.add_const_tensor(ACL_SRC_1, &b_to_use); } // Release temporary tensors that are only 
used in prepare stage @@ -169,15 +195,17 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur a.allocator()->free(); b.allocator()->free(); + b_transposed.allocator()->free(); return dst; } - SimpleTensor compute_reference(const TensorShape &shape_a, - const TensorShape &shape_b, - const TensorShape &output_shape, - DataType data_type, - bool accumulate) + SimpleTensor compute_reference(const TensorShape &shape_a, + const TensorShape &shape_b, + const TensorShape &output_shape, + DataType data_type, + bool accumulate, + ActivationLayerInfo act_info) { // Create reference SimpleTensor a{shape_a, data_type, 1}; @@ -196,28 +224,52 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur fill(dst, 6); } - // Setting beta to 0 will effectively disable C for the - // computation of the reference: A * B + 0 * C - // Use transposed tensors if boolean enabled else use original tensors if (accumulate) { reference::gemm_accumulate(a, b, c, 1.0f, 0.f, dst); - return dst; } else { - return reference::gemm(a, b, c, 1.f, 0.f); + dst = reference::gemm(a, b, c, 1.f, 0.f); } + + if (act_info.enabled()) + { + return reference::activation_layer(dst, act_info); + } + return dst; } TensorType _target{}; SimpleTensor _reference{}; }; -template +template class CpuGemmAssemblyDispatchValidationFixture : protected CpuGemmAssemblyDispatchGenericValidationFixture { +public: + void setup(TensorShape shape_a, + TensorShape shape_b, + TensorShape shape_c, + TensorShape output_shape, + float alpha, + float beta, + DataType data_type, + bool accumulate, + bool pretranspose_b, + ActivationLayerInfo act_info) + { + CpuGemmAssemblyDispatchGenericValidationFixture::setup( + shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate, pretranspose_b, act_info); + } +}; + +#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS +template +class CpuGemmAssemblyDispatchFixedFormatFixture + : protected CpuGemmAssemblyDispatchGenericValidationFixture +{ public: void setup(TensorShape shape_a, TensorShape shape_b, @@ -227,11 +279,159 @@ class CpuGemmAssemblyDispatchValidationFixture float beta, DataType data_type) { - CpuGemmAssemblyDispatchGenericValidationFixture::setup( - shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + this->_target = compute_target(shape_a, shape_b, shape_c, output_shape, data_type); + this->_reference = + this->compute_reference(shape_a, shape_b, output_shape, data_type, false, ActivationLayerInfo()); } + +protected: + inline TensorInfo prepare_weights(const TensorInfo tensor_info, const arm_compute::WeightFormat weight_format) + { + const DataLayout data_layout = tensor_info.data_layout(); + const DataType data_type = tensor_info.data_type(); + const TensorShape tensor_shape = tensor_info.tensor_shape(); + const int N = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)]; // N=O + const int H = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)]; + const int W = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)]; + const int C = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)]; // C=I + + const int interleave_by = arm_compute::interleave_by(weight_format); + const int block_by = arm_compute::block_by(weight_format); + const int Ip = arm_gemm::roundup(C, block_by); // C'=I' + const int Op = arm_gemm::roundup(N, 
interleave_by); // O'=N' + + arm_compute::Strides strides_in_bytes = tensor_info.strides_in_bytes(); + strides_in_bytes.set(1, Ip * interleave_by * W * tensor_info.element_size()); + strides_in_bytes.set(2, Op * interleave_by * W * tensor_info.element_size()); + + const size_t offset_first_element_in_bytes = tensor_info.offset_first_element_in_bytes(); + + // Total size needs to include padded dimensions + const size_t total_size_in_bytes = Op * H * W * Ip * tensor_info.element_size(); + + const TensorShape TS({tensor_shape[0], arm_compute::ceil_to_multiple(tensor_shape[1], 4)}); + + TensorInfo new_tensor_info = tensor_info; + new_tensor_info.set_data_layout(DataLayout::UNKNOWN); + new_tensor_info.init(TS, tensor_info.num_channels(), data_type, strides_in_bytes, offset_first_element_in_bytes, + total_size_in_bytes); + return new_tensor_info; + } + + TensorType compute_target( + TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, DataType data_type) + { + ARM_COMPUTE_UNUSED(shape_c); + permute(shape_b, PermutationVector(1U, 0U)); + // Create tensors + TensorType a = create_tensor(shape_a, data_type, 1, QuantizationInfo(), DataLayout::NCHW); + TensorType b = create_tensor(shape_b, data_type, 1, QuantizationInfo(), DataLayout::NCHW); + TensorType c = nullptr; + TensorType dst = create_tensor(output_shape, data_type, 1, QuantizationInfo(), DataLayout::NCHW); + + // Create and configure function + FunctionType gemm; + NEReorderLayer reorder; + arm_compute::WeightFormat computed_weight_format{arm_compute::WeightFormat::ANY}; + GEMMInfo gemm_info; + + gemm_info.set_fixed_format(true); + gemm_info.set_accumulate(false); + gemm_info.set_weight_format(computed_weight_format); + + const bool kernel_found = bool( + FunctionType::has_opt_impl(computed_weight_format, a.info(), b.info(), nullptr, dst.info(), gemm_info)); + + ARM_COMPUTE_ASSERT(kernel_found); + gemm_info.set_weight_format(computed_weight_format); + gemm_info.set_fast_math(is_fixed_format_fast_math(computed_weight_format)); + TensorType b_transformed = create_tensor(prepare_weights(*b.info(), computed_weight_format)); + + a.info()->set_are_values_constant(false); + b_transformed.info()->set_are_values_constant(false); + + ARM_COMPUTE_ASSERT(a.info()->is_resizable()); + ARM_COMPUTE_ASSERT(b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(b_transformed.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Allocate tensors + a.allocator()->allocate(); + b.allocator()->allocate(); + b_transformed.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!a.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!b.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!b_transformed.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + this->fill(AccessorType(a), 0, -1.f, 1.f); + this->fill(AccessorType(b), 1, -1.f, 1.f); + + // Reorder weight to the expected format + reorder.configure(&b, &b_transformed, WeightFormat::OHWI, computed_weight_format); + reorder.run(); + + ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b_transformed.info(), nullptr, dst.info(), gemm_info)); + gemm.configure(a.info(), b_transformed.info(), nullptr, dst.info(), gemm_info); + ARM_COMPUTE_ASSERT(gemm.is_configured()); + + ITensorPack run_pack; + run_pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, &a); + run_pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &b_transformed); + run_pack.add_tensor(arm_compute::TensorType::ACL_SRC_2, &c); + 
run_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst); + + // Prepare memory + ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b_transformed}, + {arm_compute::TensorType::ACL_SRC_2, &c}}; + + experimental::MemoryRequirements aux_mem_req = gemm.workspace(); + MemoryGroup memory_group{}; + + WorkspaceData workspace = manage_workspace(aux_mem_req, memory_group, run_pack, prep_pack); + + gemm.prepare(prep_pack); + MemoryGroupResourceScope scope_mg(memory_group); + + auto has_reshape = std::find_if(aux_mem_req.begin(), aux_mem_req.end(), + [](const arm_compute::experimental::MemoryInfo &m) -> bool { + return m.lifetime == arm_compute::experimental::MemoryLifetime::Persistent; + }); + + if (has_reshape != std::end(aux_mem_req)) + { + b_transformed.mark_as_unused(); + } + else + { + run_pack.add_const_tensor(ACL_SRC_1, &b_transformed); + } + + // Release temporary tensors that are only used in prepare stage + release_temporaries(aux_mem_req, workspace); + // End of preparing + + gemm.run(run_pack); + + a.allocator()->free(); + b.allocator()->free(); + b_transformed.allocator()->free(); + + return dst; + } + + TensorType _target{}; + SimpleTensor _reference{}; + bool _kernel_found{false}; }; +#endif //ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS + } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/CpuGemmConv2dFixture.h b/tests/validation/fixtures/CpuGemmConv2dFixture.h index c8e82fb8a0..67ba4e74db 100644 --- a/tests/validation/fixtures/CpuGemmConv2dFixture.h +++ b/tests/validation/fixtures/CpuGemmConv2dFixture.h @@ -158,6 +158,183 @@ class CpuGemmConv2dValidationFixture : public framework::Fixture DataLayout _data_layout{DataLayout::NHWC}; }; +template +class CpuGemmConv2dStaticQuantValidationFixture : public ConvolutionValidationGenericFixture +{ +public: + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, + DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info) + { + ARM_COMPUTE_ASSERT(data_type == DataType::QASYMM8_SIGNED || data_type == DataType::QASYMM8); + + // This hash is used by random generators. There may be hash collisions but + // this is intentional as it's a very easy way to make the the current + // random generation process almost different for many test configurations, + // which were using the same set of values before. 
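    // (Illustrative example with hypothetical shapes: a 23x27x5x1 input and 3x3x5x16 weights
    //  in QASYMM8/NCHW give a hash of 23+27+5+1 + 3+3+5+16 + 0 + 0 = 83, which is added to
    //  every fill() seed below so that different configurations draw different random data.)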
+ this->_hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] + + + weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] + + (data_type == DataType::QASYMM8_SIGNED) + (data_layout == DataLayout::NHWC); + + this->_data_type = data_type; + this->_weights_data_type = weights_data_type; + this->_bias_data_type = DataType::S32; + this->_output_data_type = data_type; + this->_quantization_info = quantization_info; + this->_weight_quantization_info = weight_quantization_info; + this->_data_layout = data_layout; + this->_dst_q_info = quantization_info; + + if(!is_data_type_quantized_symmetric(weights_data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY)) + { + this->setup_quantization(input_shape, weights_shape, this->_quantization_info, this->_weight_quantization_info, data_type); + this->_use_dynamic_output_quant = true; + } + + this->_target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info); + + this->_reference = this->compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info); + } + +protected: + + // Compute the target when updating static quantization information after configuration for the stateless api. + TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info, + bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false) + { + ARM_COMPUTE_ASSERT((std::is_same::value == true)); + + ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0); + + const unsigned int num_groups = input_shape[2] / weights_shape[2]; + + if(this->_data_layout == DataLayout::NHWC) + { + permute(input_shape, PermutationVector(2U, 0U, 1U)); + permute(weights_shape, PermutationVector(2U, 0U, 1U)); + permute(output_shape, PermutationVector(2U, 0U, 1U)); + + if(pre_pad_layer.size() > 0) + { + // make sure paddings exist for each c,h,w dimensions + for(unsigned int i = 0; i < 3 - pre_pad_layer.size(); ++i) + { + pre_pad_layer.push_back({ 0, 0 }); + } + + // rotate padding info from nchw to nhwc + std::rotate(pre_pad_layer.begin(), pre_pad_layer.begin() + 2, pre_pad_layer.begin() + 3); + } + } + + const int idx_width = get_data_layout_dimension_index(this->_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(this->_data_layout, DataLayoutDimension::HEIGHT); + + WeightsInfo weights_info(!reshape_weights, weights_shape[idx_width], weights_shape[idx_height], weights_shape[3]); + TensorShape reshaped_weights_shape(weights_shape); + + // Create tensors with fake quantization info and defer to pass the correct ones to a later stage. 
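    // (Descriptive note: the third QuantizationInfo argument marks the scale/offset pair as
    //  dynamic, i.e. changeable after configure(); the placeholder values used here are
    //  replaced with the real ones further down and then propagated into the operator via
    //  update_quantization_parameters().)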
+ auto qi = QuantizationInfo(0.550721, 37, true); + TensorType src = create_tensor(input_shape, this->_data_type, 1, qi, this->_data_layout); + TensorType weights = create_tensor(reshaped_weights_shape, this->_weights_data_type, 1, qi, this->_data_layout); + TensorType dst = create_tensor(output_shape, this->_output_data_type, 1, qi, this->_data_layout); + TensorType bias = create_tensor(bias_shape, this->_bias_data_type, 1, QuantizationInfo() /*bias is not a quantized type*/, this->_data_layout); + + // Create and configure function + FunctionType conv; + + const unsigned int height_index = arm_compute::graph::get_dimension_idx(this->_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int width_index = arm_compute::graph::get_dimension_idx(this->_data_layout, DataLayoutDimension::WIDTH); + + const PaddingInfo pad_w = width_index < pre_pad_layer.size() ? pre_pad_layer[width_index] : PaddingInfo(0, 0); + const PaddingInfo pad_h = height_index < pre_pad_layer.size() ? pre_pad_layer[height_index] : PaddingInfo(0, 0); + + if(pre_pad_layer.size() > 0 && arm_compute::graph::is_padding_in_height_or_width(this->_data_layout, pre_pad_layer)) + { + // this is the logic implemented in NodeFusionMutator -> fuse_pad_with_convolution + const PadStrideInfo new_conv_info( + info.stride().first, + info.stride().second, + info.pad_left() + pad_w.first, + info.pad_right() + pad_w.second, + info.pad_top() + pad_h.first, + info.pad_bottom() + pad_h.second, + info.round()); + + conv.configure(src.info(), weights.info(), bias.info(), dst.info(), new_conv_info, weights_info, dilation, act_info, false /* enable_fast_math */, num_groups); + auto const status = conv.validate(src.info(), weights.info(), bias.info(), dst.info(), new_conv_info); + ARM_COMPUTE_ASSERT(status); + } + else + { + conv.configure(src.info(), weights.info(), bias.info(), dst.info(), info, weights_info, dilation, act_info, false /* enable_fast_math */, num_groups); + auto const status = conv.validate(src.info(), weights.info(), bias.info(), dst.info(), info); + ARM_COMPUTE_ASSERT(status); + } + + // After calling configure, we appropriately set the correct quantization info and update ACL. + src.info()->set_quantization_info(QuantizationInfo(this->_quantization_info.scale(), this->_quantization_info.offset(), true)); + weights.info()->set_quantization_info(QuantizationInfo(this->_weight_quantization_info.scale(), this->_weight_quantization_info.offset(), true)); + dst.info()->set_quantization_info(QuantizationInfo(this->_dst_q_info.scale(), this->_dst_q_info.offset(), true)); + + ARM_COMPUTE_ASSERT(src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(weights.info()->is_resizable()); + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Test "add padding after configure" behavior. 
This behavior should not affect the correctness + add_padding_x({ &src, &bias, &dst }, this->_data_layout); + // Padding weights may affect code path in some backends + if (padded_weights) + { + add_padding_x({ &weights }, this->_data_layout); + } + + // // Allocate tensors + src.allocator()->allocate(); + weights.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + + ITensorPack run_pack{ + {ACL_SRC_0, &src}, {ACL_SRC_1, &weights}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}; + ITensorPack prep_pack{{ACL_SRC_1, &weights}, {ACL_SRC_2, &bias}}; + + // propagate trough ACL the correct quantization info + conv.update_quantization_parameters(run_pack); + + auto mg = MemoryGroup{}; + auto ws = manage_workspace(conv.workspace(), mg, run_pack, prep_pack); + + ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!weights.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + this->fill(AccessorType(src), 0 + this->_hash); + this->fill(AccessorType(weights), 1 + this->_hash); + this->fill(AccessorType(bias), 2 + this->_hash); + + // Compute Convolution function + conv.prepare(prep_pack); + conv.run(run_pack); + + return dst; + } +}; + +template +class CpuGemmConv2dForUpdatedStaticQuantInfoAfterConfigureFixture : public CpuGemmConv2dStaticQuantValidationFixture +{ +public: + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, + DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info) + { + CpuGemmConv2dStaticQuantValidationFixture::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, + data_type, data_type, data_layout, quantization_info, quantization_info, act_info); + } +}; + } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/fixtures/CpuQuantizeFixture.h b/tests/validation/fixtures/CpuQuantizeFixture.h new file mode 100644 index 0000000000..fb2be16339 --- /dev/null +++ b/tests/validation/fixtures/CpuQuantizeFixture.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H + +#include "tests/validation/fixtures/QuantizationLayerFixture.h" +#include "tests/validation/Helpers.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "tests/validation/Helpers.h" +namespace arm_compute +{ +namespace test +{ +namespace validation +{ + +template +class CpuQuantizationValidationFixture : public QuantizationValidationFixture +{ +public: +void setup(TensorShape shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo) +{ + QuantizationInfo qinfo_in; + if(std::is_same::value && // Cpu + (data_type_in == DataType::F16 || data_type_out == DataType::F16) && !CPUInfo::get().has_fp16()) + { + return; + } + + if(!cpu_supports_dtypes({data_type_in, data_type_out})) { + return; + } + + this->_target = compute_target(shape, data_type_in, data_type_out, qinfo, qinfo_in); + this->_reference = this->compute_reference(shape, data_type_in, data_type_out, qinfo, qinfo_in); +} + +protected: + TensorType compute_target(const TensorShape &shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo, QuantizationInfo qinfo_in) + { + // Create tensors + TensorType src = create_tensor(shape, data_type_in, 1, qinfo_in); + TensorType dst = create_tensor(shape, data_type_out, 1, qinfo); + + // Create and configure function + FunctionType quantization_layer; + quantization_layer.configure(src.info(), dst.info()); + + ARM_COMPUTE_ASSERT(src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + this->fill(AccessorType(src)); + + // Prepare tensor pack + ITensorPack run_pack = { { arm_compute::TensorType::ACL_SRC, &src }, + { arm_compute::TensorType::ACL_DST, &dst } }; + auto mg = MemoryGroup{}; + auto ws = arm_compute::manage_workspace(quantization_layer.workspace(), mg, run_pack); + allocate_tensors(quantization_layer.workspace(), ws); + + // Compute function + quantization_layer.run(run_pack); + + return dst; + } +}; + + +} // namespace validation +} // namespace test +} // namespace arm_compute + + +#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index 854442b174..e572f41ec4 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -95,7 +95,7 @@ template ::value == true)); ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); @@ -125,7 +125,7 @@ TensorType compute_gemmlowp_target_for_updated_sq_info_after_config(const Tensor gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? 
shape_output[2] : 0), reinterpret_input_as_3d, false, output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/, - arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, + act_info, false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, false /* pretranspose_B */, accumulate)); ARM_COMPUTE_ASSERT(a.info()->is_resizable()); @@ -216,7 +216,7 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape FunctionType gemmlowp; gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false, output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/, - arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, + ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, false /* pretranspose_B */, accumulate)); // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic) @@ -467,9 +467,15 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo, + const ActivationLayerInfo& act_info, GEMMLowpOutputStageType type, GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { + // Supported activations in GEMM + const std::set supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_quantized_asymmetric(data_type)); const UniformQuantizationInfo aq_unif = a_qinfo.uniform(); @@ -485,7 +491,14 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ int32_t type_min = 0; int32_t type_max = 0; - std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type); + + if (supported_acts.find(act_info.activation()) != supported_acts.end()) + { + std::tie(type_min, type_max) = + arm_compute::get_quantized_activation_min_max(act_info, data_type, oq_unif); + } else { + std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type); + } gemmlowp_output_stage_info.gemmlowp_real_multiplier = multiplier; gemmlowp_output_stage_info.gemmlowp_multiplier = int_multiplier; @@ -507,7 +520,7 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ * */ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, - bool reshape_b_only_on_first_run, bool updated_sq_info_after_config = false) + bool reshape_b_only_on_first_run, bool updated_sq_info_after_config = false, const ActivationLayerInfo& act_info = ActivationLayerInfo()) { ARM_COMPUTE_ASSERT(output_stage_type != GEMMLowpOutputStageType::NONE); ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type)); @@ -521,20 +534,20 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ setup_quantization(data_type, shape_a, shape_b, a_qinfo, b_qinfo, output_qinfo, finfo); GEMMLowpOutputStageInfo output_stage; - init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, 
output_qinfo, output_stage_type, output_stage); + init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, act_info, output_stage_type, output_stage); _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, output_stage, finfo); - _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo, updated_sq_info_after_config); + _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo, updated_sq_info_after_config, act_info); } protected: TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo, - DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), bool updated_sq_info_after_config = false) + DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), bool updated_sq_info_after_config = false, const ActivationLayerInfo& act_info = ActivationLayerInfo()) { if (updated_sq_info_after_config) { return compute_gemmlowp_target_for_updated_sq_info_after_config(shape_a, shape_b, shape_output, a_qinfo, - b_qinfo, output_qinfo, data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo); + b_qinfo, output_qinfo, data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo, false, arm_compute::DataType::UNKNOWN, act_info); } else { diff --git a/tests/validation/fixtures/PermuteFixture.h b/tests/validation/fixtures/PermuteFixture.h index b1b3845a8d..7dacf5181a 100644 --- a/tests/validation/fixtures/PermuteFixture.h +++ b/tests/validation/fixtures/PermuteFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_PERMUTE_FIXTURE -#define ARM_COMPUTE_TEST_PERMUTE_FIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorShape.h" @@ -33,6 +33,7 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" +#include "tests/validation/Helpers.h" #include "tests/validation/reference/Permute.h" namespace arm_compute @@ -47,6 +48,12 @@ class PermuteValidationFixture : public framework::Fixture public: void setup(TensorShape input_shape, PermutationVector perm, DataType data_type) { + if (std::is_same::value && // CPU + !cpu_supports_dtypes({data_type})) + { + return; + } + _target = compute_target(input_shape, data_type, perm); _reference = compute_reference(input_shape, data_type, perm); } @@ -108,4 +115,4 @@ class PermuteValidationFixture : public framework::Fixture } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_PERMUTE_FIXTURE */ +#endif // ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h index 399a8b70c4..582018f2f1 100644 --- a/tests/validation/fixtures/SoftmaxLayerFixture.h +++ b/tests/validation/fixtures/SoftmaxLayerFixture.h @@ -73,8 +73,9 @@ class SoftmaxValidationGenericFixture : public framework::Fixture { arm_compute::utils::uniform_real_distribution_16bit distribution{ -10.0f, 10.0f }; library->fill(tensor, distribution, 0); - } - else if(!is_data_type_quantized(tensor.data_type())) + }else if(tensor.data_type() == DataType::BFLOAT16){ + library->fill_tensor_uniform(tensor, 0); + }else if(!is_data_type_quantized(tensor.data_type())) { std::uniform_int_distribution<> distribution(0, 100); library->fill(tensor, distribution, 0); diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp index 30c577d850..0eb1f46848 100644 --- a/tests/validation/reference/GEMMLowp.cpp +++ b/tests/validation/reference/GEMMLowp.cpp @@ -130,14 +130,17 @@ void quantize_down_scale_by_fixedpoint(const SimpleTensor *in, const Simple } result += result_offset_after_shift; - // Bounded ReLu - if(min != max) + result = std::max(min, std::min(max, result)); + + if (min == std::numeric_limits::min() && max == std::numeric_limits::max()) { - result = std::max(min, std::min(max, result)); + (*dst)[i] = static_cast(std::max(std::numeric_limits::lowest(), + std::min(std::numeric_limits::max(), result))); + } + else + { + (*dst)[i] = static_cast(result); } - - (*dst)[i] = static_cast(std::max(std::numeric_limits::lowest(), - std::min(std::numeric_limits::max(), result))); } } diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp index 3fbac32a9b..08c091b4a9 100644 --- a/tests/validation/reference/SoftmaxLayer.cpp +++ b/tests/validation/reference/SoftmaxLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -103,19 +103,97 @@ SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, in return dst; } +template ::value, int>::type> +SimpleTensor softmax_layer_bfloat16(const SimpleTensor &src, float beta, int32_t axis, bool is_log) +{ + // Create reference + SimpleTensor dst{ src.shape(), src.data_type(), 1 }; + + const int32_t n_dims = static_cast(src.shape().num_dimensions()); + ARM_COMPUTE_ERROR_ON(axis < -n_dims || axis >= n_dims); + + const unsigned int actual_axis = static_cast(wrap_around(axis, n_dims)); + Window window; + window.use_tensor_dimensions(src.shape()); + const unsigned int axis_dimension = src.shape()[actual_axis]; + window.set(actual_axis, Window::Dimension(0, 1, 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Find max along axis + Coordinates offset(id); + offset.set(actual_axis, 0); + T max = *reinterpret_cast(src(offset)); + float max_f = float(max); + for(unsigned int axis_id = 1; axis_id < axis_dimension; ++axis_id) + { + offset.set(actual_axis, axis_id); + const T val = *reinterpret_cast(src(offset)); + float val_f = float(val); + + if(val_f > max_f) + { + max_f = val_f; + } + } + + // Regularize + float sum(0.f); + for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id) + { + offset.set(actual_axis, axis_id); + const T val = *reinterpret_cast(src(offset)); + + float val_f = float(val); + float beta_f = float(beta); + float res{ (val_f - max_f) *beta_f }; + if(is_log) + { + sum += std::exp(res); + } + else + { + res = std::exp(res); + sum += res; + } + *reinterpret_cast(dst(offset)) = res; + } + + // Normalize + for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id) + { + offset.set(actual_axis, axis_id); + const T val = *reinterpret_cast(dst(offset)); + float val_f = float(val); + if(is_log) + { + *reinterpret_cast(dst(offset)) = val - static_cast(std::log(sum)); + } + else + { + *reinterpret_cast(dst(offset)) = (val_f / sum); + } + } + }); + return dst; +} + template SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, int32_t axis, bool is_log); template SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, int32_t axis, bool is_log); template ::value, int>::type> SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log) { - return softmax_layer_generic(src, beta, axis, is_log); + if(std::is_same::value){ + return softmax_layer_bfloat16(src, beta, axis, is_log); + }else{ + return softmax_layer_generic(src, beta, axis, is_log); + } } template < typename T, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type > SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log) -{ - const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log); +{ const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log); SimpleTensor src_tmp = convert_from_asymmetric(src); SimpleTensor dst_tmp = softmax_layer(src_tmp, beta, axis, is_log); @@ -127,6 +205,7 @@ template SimpleTensor softmax_layer(const SimpleTensor &src, float template SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log); template SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log); template SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log); +template 
SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis, bool is_log); } // namespace reference } // namespace validation diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h index 3362f195c9..609d10b3ac 100644 --- a/tests/validation/reference/SoftmaxLayer.h +++ b/tests/validation/reference/SoftmaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_SOFTMAX_LAYER_H -#define ARM_COMPUTE_TEST_SOFTMAX_LAYER_H +#ifndef ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H +#define ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H #include "tests/SimpleTensor.h" #include "tests/validation/Helpers.h" @@ -38,6 +38,9 @@ namespace reference template ::value, int>::type = 0> SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, int32_t axis, bool is_log = false); +template ::value, int>::type = 0> +SimpleTensor softmax_layer_bfloat16(const SimpleTensor &src, float beta, int32_t axis, bool is_log = false); + template ::value, int>::type = 0> SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t axis = 0, bool is_log = false); @@ -47,4 +50,4 @@ SimpleTensor softmax_layer(const SimpleTensor &src, float beta, int32_t ax } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_SOFTMAX_LAYER_H */ +#endif // ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H diff --git a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp index 613ec24bff..a5f3323160 100644 --- a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp +++ b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp @@ -24,6 +24,8 @@ #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/datasets/LargeGEMMDataset.h" #include "tests/datasets/SmallGEMMDataset.h" #include "tests/framework/Asserts.h" @@ -137,7 +139,8 @@ TEST_CASE(MemoryInjection, framework::DatasetMode::ALL) auto result_1 = run_conv(); for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) { - ARM_COMPUTE_EXPECT(reinterpret_cast(result_0.buffer())[i] == reinterpret_cast(result_1.buffer())[i], + ARM_COMPUTE_EXPECT(reinterpret_cast(result_0.buffer())[i] == + reinterpret_cast(result_1.buffer())[i], framework::LogLevel::ERRORS); } } @@ -192,111 +195,216 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL) auto result_1 = run_conv(); for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) { - ARM_COMPUTE_EXPECT((reinterpret_cast(result_0.buffer()))[i] == (reinterpret_cast(result_1.buffer()))[i], + ARM_COMPUTE_EXPECT((reinterpret_cast(result_0.buffer()))[i] == + (reinterpret_cast(result_1.buffer()))[i], framework::LogLevel::ERRORS); }; } // *INDENT-OFF* // clang-format off -DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( - make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type - TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), - }), - make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 
1, DataType::S32), - TensorInfo(TensorShape(8U, 27U), 1, DataType::F32), - }), - make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32), - TensorInfo(TensorShape(8U, 13U), 1, DataType::F32), - }), - make("Expected", { false, true })), - lhs_info, rhs_info, output_info, expected) +DATA_TEST_CASE(ValidateAllDataTypes, + framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("DataType"), + datasets::AllDataTypes("DataType"), + datasets::AllDataTypes("DataType"), + make("fixed_format", {true, false})), + lhs_data_type, rhs_data_type, output_data_type, fixed_format) { - const auto gemm_info = GEMMInfo(); + auto gemm_info = GEMMInfo(); + auto asm_info = arm_compute::cpu::AsmGemmInfo(); + auto lhs_info = TensorInfo(TensorShape(21U, 13U), 1, lhs_data_type); + auto rhs_info = TensorInfo(TensorShape(33U, 21U), 1, rhs_data_type); + auto output_info = TensorInfo(TensorShape(33U, 13U), 1, output_data_type); + gemm_info.set_fixed_format(fixed_format); + asm_info.fixed_format = fixed_format; + + if (fixed_format) { + WeightFormat wf = WeightFormat::ANY; + gemm_info.set_accumulate(false); + asm_info.accumulate = false; + gemm_info.set_weight_format(wf); + asm_info.weight_format = wf; + gemm_info.set_fast_math(rhs_data_type == DataType::BFLOAT16 && fixed_format); + asm_info.fast_mode = rhs_data_type == DataType::BFLOAT16 && fixed_format; + + experimental::op::ll::CpuGemmAssemblyDispatch::has_opt_impl(wf, &lhs_info, &rhs_info, nullptr, &output_info, gemm_info); + gemm_info.set_weight_format(wf); + asm_info.weight_format = wf; + rhs_info.set_data_layout(DataLayout::NCHW); + } + + const auto supports = { + std::make_tuple(DataType::F32, DataType::F32, DataType::F32), + std::make_tuple(DataType::F16, DataType::F16, DataType::F16), + std::make_tuple(DataType::BFLOAT16, DataType::BFLOAT16, DataType::BFLOAT16), + std::make_tuple(DataType::BFLOAT16, DataType::BFLOAT16, DataType::F32), + std::make_tuple(DataType::F32, DataType::BFLOAT16, DataType::F32), + }; + const auto config = std::make_tuple(lhs_data_type, rhs_data_type, output_data_type); + + bool expected = arm_compute::cpu::CpuGemmAssemblyDispatch::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), asm_info) && + (std::find(supports.begin(), supports.end(), config) != supports.end()); + bool is_valid = bool(experimental::op::ll::CpuGemmAssemblyDispatch::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), gemm_info)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } template -using CpuGemmAssemblyDispatchFixture = CpuGemmAssemblyDispatchValidationFixture; +using CpuGemmAssemblyDispatchFixture = CpuGemmAssemblyDispatchValidationFixture; + +#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS template -using CpuGemmAccumulateFixture = CpuGemmAssemblyDispatchValidationFixture; +using CpuGemmFixedFormatFixture = CpuGemmAssemblyDispatchFixedFormatFixture; +#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS TEST_SUITE(Float) -DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine( - zip(make("In0",{ TensorShape(21U, 13U) }), - make("In1", { TensorShape(33U, 21U) }), - make("Dst", { TensorShape(33U, 13U) })), - zip( - make("is_c_null", { false, false, false, true }), - make("Expected", { true, true, true, true }))), - shape_a, shape_b, shape_dst, is_c_null, expected) +DATA_TEST_CASE(ValidateAccumulate, + 
framework::DatasetMode::ALL, + combine( + make("In0",{ TensorShape(21U, 13U) }), + make("In1", { TensorShape(33U, 21U) }), + make("Dst", { TensorShape(33U, 13U) }), + make("Expected", { true })), + shape_a, shape_b, shape_dst, expected) { - ARM_COMPUTE_UNUSED(is_c_null); /* Accumulation test for GEMM kernels */ // Create tensors TensorInfo in_a(shape_a, 1, DataType::F32); TensorInfo in_b(shape_b, 1, DataType::F32); - TensorInfo in_c(shape_dst, 1, DataType::F32); TensorInfo dst(shape_dst, 1, DataType::F32); GEMMInfo gemm_info = GEMMInfo(); gemm_info.set_accumulate(true); // Validate accumulation - Status status = experimental::op::ll::CpuGemmAssemblyDispatch::validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, gemm_info); + Status status = experimental::op::ll::CpuGemmAssemblyDispatch::validate(&in_a, &in_b, nullptr, &dst, gemm_info); ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); } #ifdef ARM_COMPUTE_ENABLE_FP16 TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAssemblyDispatchFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), - make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunSmall, + CpuGemmAssemblyDispatchFixture, + framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallGEMMDataset(), + make("DataType", DataType::F16), + make("Accumulate", false), + make("Pretranspose_B", {false, true}), + make("ActivationInfo", { + ActivationLayerInfo(), + ActivationLayerInfo(ActivationFunction::RELU), + ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f), + ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f) + }))) { - // Validate output - validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); + if(CPUInfo::get().has_fp16()) + { + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } } -FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAssemblyDispatchFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), - make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunLarge, + CpuGemmAssemblyDispatchFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMDataset(), + make("DataType", DataType::F16), + make("Accumulate", false), + make("Pretranspose_B", {false, true}), + make("ActivationInfo", { + ActivationLayerInfo(), + ActivationLayerInfo(ActivationFunction::RELU), + ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f), + ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f) + }))) { - // Validate output - validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); + if(CPUInfo::get().has_fp16()) + { + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. 
Test SKIPPED."); + framework::ARM_COMPUTE_PRINT_INFO(); + } } - TEST_SUITE_END() // FP16 #endif /* ARM_COMPUTE_ENABLE_FP16 */ TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAssemblyDispatchFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), - make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunSmall, + CpuGemmAssemblyDispatchFixture, + framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallGEMMDataset(), + make("DataType", DataType::F32), + make("Accumulate", {false, true}), + make("Pretranspose_B", {false, true}), + make("ActivationInfo", { + ActivationLayerInfo(), + ActivationLayerInfo(ActivationFunction::RELU), + ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f), + ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f) + }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } -FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAssemblyDispatchFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), - make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunLarge, + CpuGemmAssemblyDispatchFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeGEMMDataset(), + make("DataType", DataType::F32), + make("Accumulate", {false, true}), + make("Pretranspose_B", {false, true}), + make("ActivationInfo", { + ActivationLayerInfo(), + ActivationLayerInfo(ActivationFunction::RELU), + ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f), + ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f) + }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } -TEST_SUITE(ACCUMULATE) -FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAccumulateFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(), - make("DataType", DataType::F32))) +#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS + +TEST_SUITE(FIXED_FORMAT) +FIXTURE_DATA_TEST_CASE(RunSmall, + CpuGemmFixedFormatFixture, + framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallGEMMDataset(), + make("DataType", DataType::F32) + )) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } -FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAccumulateFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(), - make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunLarge, + CpuGemmFixedFormatFixture, + framework::DatasetMode::NIGHTLY, + combine( + datasets::LargeGEMMDataset(), + make("DataType", DataType::F32) + )) { // Validate output validate(Accessor(_target), _reference, tolerance_f); } -TEST_SUITE_END() // ACCUMULATE +TEST_SUITE_END() // FIXED_FORMAT +#endif // ARM_COMPUTE_FIXED_FORMAT_KERNELS + TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float diff --git a/tests/validation/runtime/experimental/operators/CpuDequantize.cpp b/tests/validation/runtime/experimental/operators/CpuDequantize.cpp new file mode 100644 index 0000000000..d6c74c4fa2 --- /dev/null +++ b/tests/validation/runtime/experimental/operators/CpuDequantize.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/experimental/operators/CpuDequantize.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "tests/NEON/Accessor.h" +#include "tests/PaddingCalculator.h" +#include "tests/datasets/DatatypeDataset.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/CpuDequantizeFixture.h" + + +/* + * Tests for arm_compute::experimental::op::CpuDequantize which is a shallow wrapper for arm_compute::cpu::CpuDequantize. + * Any future testing to the functionalities of cpu::CpuDequantize will be tested in tests/NEON/DequantizationLayer.cpp given that experimental::op::CpuDequantize remain a shallow wrapper. 
+*/ + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ + using framework::dataset::make; +#ifdef ARM_COMPUTE_ENABLE_FP16 +const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +#else /* ARM_COMPUTE_ENABLE_FP16 */ +const auto data_types = framework::dataset::make("DataType", { DataType::F32 }); +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + +const auto dataset_quant_f32 = combine(datasets::SmallShapes(), + datasets::QuantizedTypes(), + make("DataType", DataType::F32), + make("DataLayout", { DataLayout::NCHW }) + ); + +const auto dataset_quant_asymm_signed_f32 = combine(datasets::SmallShapes(), + make("QuantizedTypes", { DataType::QASYMM8_SIGNED }), + make("DataType", DataType::F32), + make("DataLayout", { DataLayout::NCHW }) + ); + +const auto dataset_quant_per_channel_f32 = combine(datasets::SmallShapes(), datasets::QuantizedPerChannelTypes(), + make("DataType", DataType::F32), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }) +); + +const auto dataset_precommit_f32 = concat(concat(dataset_quant_f32, dataset_quant_per_channel_f32), dataset_quant_asymm_signed_f32); + + +} // namespace + +TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) +TEST_SUITE(CpuDequantize) + + +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), // Wrong input data type + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8), // Wrong output data type + TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::QASYMM8), // Mismatching shapes + TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::QASYMM8), // Valid + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8), // Valid + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8_SIGNED), // Valid + }), + make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U8), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), + }), + make("Expected", { false, false, false, true, true, true })), + input_info, output_info, expected) +{ + ARM_COMPUTE_EXPECT(bool(arm_compute::experimental::op::CpuDequantize::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS); +} +// clang-format on + +using arm_compute::experimental::op::CpuDequantize; +template +using CpuDequantizeFixture = CpuDequantizationValidationFixture; + + +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuDequantizeFixture, framework::DatasetMode::ALL, dataset_precommit_f32) +{ + // Validate output + validate(Accessor(_target), _reference); +} + + +TEST_SUITE_END() // CpuDequantize +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp b/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp new file mode 100644 index 0000000000..b8e6830700 --- /dev/null +++ b/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2017-2024 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h" +#include "arm_compute/runtime/Tensor.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "tests/NEON/Accessor.h" +#include "tests/datasets/LargeGEMMLowpDataset.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SmallGEMMLowpDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/fixtures/CpuGEMMLowpFixture.h" + +/* + + * Tests for arm_compute::experimental::op::CpuGEMMLowp which is a shallow wrapper for + * arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore Any future testing to the functionalities of arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore will + * be tested in tests/validation/NEON/GEMMLowp.cpp given that op::CpuGEMMLowp remain a shallow wrapper. 
+*/ + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +using framework::dataset::make; + +TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) +TEST_SUITE(CpuGEMMLowp) + +using CpuGEMMLowpFixture = CpuGEMMLowpMatrixMultiplyCoreValidationFixture; + +using framework::dataset::make; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()), + shape_a, shape_b, shape_c, a_offset, b_offset) +{ + // Create tensors + Tensor a = create_tensor(shape_a, DataType::QASYMM8); + Tensor b = create_tensor(shape_b, DataType::QASYMM8); + Tensor c = create_tensor(shape_c, DataType::S32); + + a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset)); + b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset)); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + arm_compute::experimental::op::CpuGEMMLowp gemmlowp_mm; + gemmlowp_mm.configure(a.info(), b.info(), nullptr, c.info()); + + // Validate padding is zero + validate(a.info()->padding(), PaddingSize()); + validate(b.info()->padding(), PaddingSize()); + validate(c.info()->padding(), PaddingSize()); +} +// accumulation is not supported for Int8/UInt8 in aarch32 +#ifdef __aarch64__ +DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine( + zip( + make("In0",{ TensorShape(21U, 1U) }), + make("In1", { TensorShape(1U, 21U) }), + make("Dst", { TensorShape(1U, 1U) }), + make("a_offset", { -2 }), + make("b_offset", { 13 }) + ), + zip( + make("OutputDataType", { DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED}), + make("Expected", { true, false, false }) + )), + shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected) +{ + DataType input_data_type = (output_data_type == DataType::S32 ? 
DataType::QASYMM8 : output_data_type); + // Accumulation test for GEMM kernels + TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset)); + TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset)); + TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo()); + + // Create and configure function + GEMMInfo gemm_info = GEMMInfo(); + gemm_info.set_accumulate(true); + + if (is_data_type_quantized(output_data_type)) + { + GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo(); + gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + + gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo); + } + + arm_compute::experimental::op::CpuGEMMLowp gemmlowp_mm; + Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info); + + ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS); +} +#endif // __arch64__ + +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip( + make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4 + TensorInfo(TensorShape(21U, 13U), 1, DataType::S32), // Mismatching data type + TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions + TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions + TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), + TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f/255, 10)), // Invalid types + }), + make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)), + }), + make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), + TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), + TensorInfo(TensorShape(33U, 13U), 1, DataType::S32), + TensorInfo(TensorShape(8U, 11U), 1, DataType::S32), + TensorInfo(TensorShape(64U, 32U), 1, DataType::S32), + TensorInfo(TensorShape(64U, 32U), 1, DataType::S32), + }), + make("Expected", { true, false, false, false, true, false })), + a_info, b_info, output_info, expected) +{ + // Lock tensors + Status status = arm_compute::experimental::op::CpuGEMMLowp::validate(&a_info.clone()->set_is_resizable(false), + &b_info.clone()->set_is_resizable(false), + nullptr, + &output_info.clone()->set_is_resizable(false)); + ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); +} +// clang-format on + +/** Test case for memory injection in @ref arm_compute::experimental::op::CpuGEMMLowp. + * + * Configure the operator once and inject memory at run-time in multiple executions. 
+ * + * Checks performed in order: + * - Both runs compute the same output + */ +TEST_CASE(MemoryInjection, framework::DatasetMode::ALL) +{ + auto gemm = std::make_unique(); + auto a_info = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8); + auto b_info = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8); + auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32); + a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9)); + b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1)); + const auto gemm_info = GEMMInfo{}; + gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info); + + // The LHS are newly created every call of this lambda function + auto a = create_tensor(a_info); + auto b = create_tensor(b_info); + auto dst = create_tensor(dst_info); + a.allocator()->allocate(); + b.allocator()->allocate(); + dst.allocator()->allocate(); + + ITensorPack run_pack = + { + { TensorType::ACL_SRC_0, &a }, + { TensorType::ACL_SRC_1, &b }, + { TensorType::ACL_DST, &dst } + }; + ITensorPack prep_pack = + { + { TensorType::ACL_SRC_1, &b }, + }; + + auto mg = MemoryGroup{}; + auto ws = manage_workspace(gemm->workspace(), mg, run_pack, prep_pack); + allocate_tensors(gemm->workspace(), ws); + + auto run_gemm = [&]() -> Tensor + { + auto dst = create_tensor(dst_info); + dst.allocator()->allocate(); + run_pack.add_tensor(TensorType::ACL_DST, &dst); + + library->fill_tensor_value(Accessor(a), static_cast(1)); + library->fill_tensor_value(Accessor(b), static_cast(2)); + // This operator is configured once and captured by this lambda. + gemm->prepare(prep_pack); + gemm->run(run_pack); + return dst; + }; + auto result_0 = run_gemm(); + auto result_1 = run_gemm(); + for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + { + ARM_COMPUTE_EXPECT(((uint8_t *)result_0.buffer())[i] == ((uint8_t *)result_1.buffer())[i], framework::LogLevel::ERRORS); + } +} + +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGEMMLowpFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset()) +{ + // Validate output + validate(Accessor(_target), _reference); +} + + +TEST_SUITE_END() // CpuGEMMLowp +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp index 9d87a3d2e5..14c8affe27 100644 --- a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp +++ b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp @@ -48,9 +48,18 @@ namespace test { namespace validation { +using framework::dataset::make; + namespace { const RelativeTolerance rel_tolerance_f32(0.01f); +#ifdef ARM_COMPUTE_ENABLE_SME +// TODO(COMPMID-6011): SME kernels and the reference model use different rounding modes. +// Temporarily increase the tolerance for quantized data.
+constexpr AbsoluteTolerance tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +#else // ARM_COMPUTE_ENABLE_SME +constexpr AbsoluteTolerance tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +#endif // ARM_COMPUTE_ENABLE_SME } // namespace TEST_SUITE(NEON) @@ -117,6 +126,8 @@ TEST_CASE(OpCpuGemmConv2dMemoryInjection, framework::DatasetMode::ALL) } using CpuGemmConv2dFixture = CpuGemmConv2dValidationFixture; +template +using CpuGemmConv2dStaticQuantFixture = CpuGemmConv2dForUpdatedStaticQuantInfoAfterConfigureFixture; TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(SmokeTest, @@ -129,6 +140,48 @@ FIXTURE_DATA_TEST_CASE(SmokeTest, } TEST_SUITE_END() // F32 +#ifdef __aarch64__ + +const auto QuantizedActivationFunctionsDataset = make("ActivationInfo", +{ + ActivationLayerInfo(), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU), + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) +}); + +TEST_SUITE(Quantized) + +TEST_SUITE(UpdateStaticQuantInfoAfterConfigure) +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGemmConv2dStaticQuantFixture, framework::DatasetMode::ALL, combine(datasets::TinyConvolutionLayerDataset(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8_SIGNED), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) }), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED + +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGemmConv2dStaticQuantFixture, framework::DatasetMode::ALL, combine(datasets::TinyConvolutionLayerDataset(), + make("ReshapeWeights", { true }), + make("DataType", DataType::QASYMM8), + make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) }), + QuantizedActivationFunctionsDataset)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8 +TEST_SUITE_END() // UpdateStaticQuantInfoAfterConfigure + +TEST_SUITE_END() // Quantized +#endif // __aarch64__ + TEST_SUITE_END() // CpuGemmConv2d TEST_SUITE_END() // OPERATORS TEST_SUITE_END() // NEON diff --git a/tests/validation/runtime/experimental/operators/CpuQuantize.cpp b/tests/validation/runtime/experimental/operators/CpuQuantize.cpp new file mode 100644 index 0000000000..9fd4bb9378 --- /dev/null +++ b/tests/validation/runtime/experimental/operators/CpuQuantize.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/experimental/operators/CpuQuantize.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "tests/NEON/Accessor.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/CpuQuantizeFixture.h" + + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ + +/* + * Tests for arm_compute::experimental::op::CpuQuantize which is a shallow wrapper for + * arm_compute::cpu::CpuQuantization. Any future testing to the functionalities of cpu::CpuQuantize + * will be tested in tests/NEON/QuantizationLayer.cpp given that op::CpuQuantize remain a + * shallow wrapper. +*/ +using arm_compute::experimental::op::CpuQuantize; +using arm_compute::test::validation::CpuQuantizationValidationFixture; +namespace +{ +/** Tolerance for quantization */ +constexpr AbsoluteTolerance tolerance_u8(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */ +const auto QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes()); +} // namespace + +TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) +TEST_SUITE(CpuQuantize) + +using framework::dataset::make ; + +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip( + make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8), // Wrong output data type + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), // Wrong output data type + TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), // Valid + }), + make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U16), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8), + })), + make("Expected", { false, false, false, true})), + input_info, output_info, expected) +{ + ARM_COMPUTE_EXPECT(bool(CpuQuantize::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS); +} +// clang-format on + +template +using CpuQuantizeQASYMM8Fixture = CpuQuantizationValidationFixture; + +FIXTURE_DATA_TEST_CASE(SmokeTest, CpuQuantizeQASYMM8Fixture, framework::DatasetMode::ALL, combine(QuantizationSmallShapes, + make("DataType", DataType::F32), + make("DataTypeOut", { DataType::QASYMM8 }), + make("QuantizationInfo", { QuantizationInfo(0.5f, 10) }) + )) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_u8); +} +TEST_SUITE_END() // CpuQuantize +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git 
a/utils/TypePrinter.h b/utils/TypePrinter.h index 2d106d849a..08d00fc96a 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -3690,6 +3690,17 @@ inline std::string to_string(const arm_compute::ScatterInfo &info) return str.str(); } +/** Formatted output of the bool data type. + * + * @param[in] info bool type to output. + * + * @return Formatted string + */ +inline std::string to_string(const bool &info) +{ + return info ? "true" : "false"; +} + } // namespace arm_compute #endif // ACL_UTILS_TYPEPRINTER_H
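As a quick illustration of the `to_string(bool)` overload added above in utils/TypePrinter.h, here is a minimal standalone sketch of how a boolean dataset argument might be formatted for logging; the copy of the function outside its namespace, the main() harness, and the "Accumulate" parameter name are assumptions for demonstration only and are not part of the patch.

#include <iostream>
#include <string>

// Mirrors the overload introduced in utils/TypePrinter.h (namespace arm_compute omitted here).
inline std::string to_string(const bool &info)
{
    return info ? "true" : "false";
}

int main()
{
    // e.g. printing a test dataset argument such as "Accumulate" or "Pretranspose_B"
    const bool accumulate = true;
    std::cout << "Accumulate=" << to_string(accumulate) << "\n";
    return 0;
}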